44import com .google .gson .JsonArray ;
55import com .google .gson .JsonElement ;
66import com .google .gson .JsonParser ;
7- import java .util .ArrayList ;
7+ import java .util .Arrays ;
88import java .util .List ;
99import org .apache .beam .sdk .Pipeline ;
1010import org .apache .beam .sdk .io .gcp .spanner .SpannerWriteResult ;
@@ -70,12 +70,6 @@ public static void buildPipeline(
7070 Pipeline pipeline , IngestionPipelineOptions options , SpannerClient spannerClient ) {
7171 LOGGER .info ("Running import pipeline for imports: {}" , options .getImportList ());
7272
73- // Initialize lists to hold mutations from all imports.
74- List <PCollection <Void >> deleteOpsList = new ArrayList <>();
75- List <PCollection <Mutation >> obsMutationList = new ArrayList <>();
76- List <PCollection <Mutation >> edgeMutationList = new ArrayList <>();
77- List <PCollection <Mutation >> nodeMutationList = new ArrayList <>();
78-
7973 // Parse the input import list JSON.
8074 JsonElement jsonElement = JsonParser .parseString (options .getImportList ());
8175 JsonArray jsonArray = jsonElement .getAsJsonArray ();
@@ -97,37 +91,8 @@ public static void buildPipeline(
9791 String graphPath = pathElement .getAsString ();
9892
9993 // Process the individual import.
100- processImport (
101- pipeline ,
102- spannerClient ,
103- importName ,
104- graphPath ,
105- options .getSkipDelete (),
106- deleteOpsList ,
107- nodeMutationList ,
108- edgeMutationList ,
109- obsMutationList );
110- }
111- // Finally, aggregate all collected mutations and write them to Spanner.
112- // 1. Process Deletes:
113- // First, execute all delete mutations to clear old data for the imports.
114- PCollection <Void > deleted =
115- PCollectionList .of (deleteOpsList ).apply ("DeleteOps" , Flatten .pCollections ());
116-
117- // 2. Process Observations:
118- // Write observation mutations after deletes are complete.
119- if (options .getWriteObsGraph ()) {
120- spannerClient .writeMutations (pipeline , "Observations" , obsMutationList , deleted );
94+ processImport (pipeline , spannerClient , importName , graphPath , options .getSkipDelete ());
12195 }
122-
123- // 3. Process Nodes:
124- // Write node mutations after deletes are complete.
125- SpannerWriteResult writtenNodes =
126- spannerClient .writeMutations (pipeline , "Nodes" , nodeMutationList , deleted );
127-
128- // 4. Process Edges:
129- // Write edge mutations only after node mutations are complete to ensure referential integrity.
130- spannerClient .writeMutations (pipeline , "Edges" , edgeMutationList , writtenNodes .getOutput ());
13196 }
13297
13398 /**
@@ -138,31 +103,26 @@ public static void buildPipeline(
138103 * @param importName The name of the import.
139104 * @param graphPath The full path to the graph data.
140105 * @param skipDelete Whether to skip delete operations.
141- * @param deleteOpsList List to collect delete Ops.
142- * @param nodeMutationList List to collect node mutations.
143- * @param edgeMutationList List to collect edge mutations.
144- * @param obsMutationList List to collect observation mutations.
145106 */
146107 private static void processImport (
147108 Pipeline pipeline ,
148109 SpannerClient spannerClient ,
149110 String importName ,
150111 String graphPath ,
151- boolean skipDelete ,
152- List <PCollection <Void >> deleteOpsList ,
153- List <PCollection <Mutation >> nodeMutationList ,
154- List <PCollection <Mutation >> edgeMutationList ,
155- List <PCollection <Mutation >> obsMutationList ) {
112+ boolean skipDelete ) {
156113 LOGGER .info ("Import: {} Graph path: {}" , importName , graphPath );
157114
158115 String provenance = "dc/base/" + importName ;
159116
160117 // 1. Prepare Deletes:
161118 // Generate mutations to delete existing data for this import/provenance.
119+ // Create a dummy signal if deletes are skipped, so downstream dependencies are satisfied
120+ // immediately.
121+ PCollection <Void > deleteObsWait = null ;
122+ PCollection <Void > deleteEdgesWait = null ;
162123 if (!skipDelete ) {
163- List <PCollection <Void >> deleteOps =
164- GraphReader .deleteExistingDataForImport (importName , provenance , pipeline , spannerClient );
165- deleteOpsList .addAll (deleteOps );
124+ deleteObsWait = spannerClient .deleteObservationsForImport (importName , pipeline );
125+ deleteEdgesWait = spannerClient .deleteEdgesForImport (provenance , pipeline );
166126 }
167127
168128 // 2. Read and Split Graph:
@@ -176,29 +136,50 @@ private static void processImport(
176136 PCollection <McfGraph > schemaNodes = graphNodes .get (PipelineUtils .SCHEMA_NODES_TAG );
177137
178138 // 3. Process Schema Nodes:
179- // Combine schema nodes if required, then convert to Node and Edge mutations .
139+ // Combine/Deduplicate nodes if required.
180140 PCollection <McfGraph > combinedGraph = schemaNodes ;
181141 if (IMPORTS_TO_COMBINE .contains (importName )) {
182142 combinedGraph = PipelineUtils .combineGraphNodes (schemaNodes );
183143 }
144+
145+ // Convert all nodes to mutations
184146 PCollection <Mutation > nodeMutations =
185147 GraphReader .graphToNodes (
186- importName , combinedGraph , spannerClient , nodeCounter , nodeInvalidTypeCounter )
148+ "NodeMutations-" + importName ,
149+ combinedGraph ,
150+ spannerClient ,
151+ nodeCounter ,
152+ nodeInvalidTypeCounter )
187153 .apply ("ExtractNodeMutations-" + importName , Values .create ());
188154 PCollection <Mutation > edgeMutations =
189- GraphReader .graphToEdges (importName , combinedGraph , provenance , spannerClient , edgeCounter )
155+ GraphReader .graphToEdges (
156+ "EdgeMutations-" + importName ,
157+ combinedGraph ,
158+ provenance ,
159+ spannerClient ,
160+ edgeCounter )
190161 .apply ("ExtractEdgeMutations-" + importName , Values .create ());
191162
192- nodeMutationList .add (nodeMutations );
193- edgeMutationList .add (edgeMutations );
163+ // Write Nodes (wait for delete)
164+ SpannerWriteResult writtenNodes =
165+ spannerClient .writeMutations (pipeline , "Nodes-" + importName , List .of (nodeMutations ), null );
166+
167+ PCollection <Void > writeEdgesWait =
168+ PCollectionList .of (Arrays .asList (writtenNodes .getOutput (), deleteEdgesWait ))
169+ .apply ("FlattenDeleteOps-" + importName , Flatten .pCollections ());
170+ // Write Edges (wait for Nodes)
171+ spannerClient .writeMutations (
172+ pipeline , "Edges-" + importName , List .of (edgeMutations ), writeEdgesWait );
194173
195174 // 4. Process Observation Nodes:
196175 // Build an optimized graph from observation nodes and convert to Observation mutations.
197176 PCollection <McfOptimizedGraph > optimizedGraph =
198177 PipelineUtils .buildOptimizedMcfGraph (observationNodes );
199178 PCollection <Mutation > observationMutations =
200179 GraphReader .graphToObservations (optimizedGraph , importName , spannerClient , obsCounter )
201- .apply ("ExtractObsMutations" , Values .create ());
202- obsMutationList .add (observationMutations );
180+ .apply ("ExtractObsMutations-" + importName , Values .create ());
181+ // Write Observations (wait for delete)
182+ spannerClient .writeMutations (
183+ pipeline , "Observations-" + importName , List .of (observationMutations ), deleteObsWaitt );
203184 }
204185}
0 commit comments