broadinstitute · sagehen03 · Apr 18, 2023 · Apr 18, 2023 · sagehen03 · Apr 18, 2023
diff --git a/bioindex/src/main/resources/compression.sh b/bioindex/src/main/resources/compression.sh
@@ -0,0 +1,3 @@
+#!/bin/bash -xe
+
+sudo aws s3 cp s3://dig-data-registry/hail.jar /usr/lib/spark/jars/
diff --git a/bioindex/src/main/resources/geneExpression.py b/bioindex/src/main/resources/geneExpression.py
@@ -16,6 +16,7 @@ def main():
     # sort and write
     df.orderBy(['gene']) \
         .write \
+        .option("compression", "is.hail.io.compress.BGzipCodec") \
         .mode('overwrite') \
         .json(outdir)
 

diff --git a/bioindex/src/main/scala/GeneExpression.scala → .../src/main/scala/GeneExpressionStage.scala b/bioindex/src/main/scala/GeneExpression.scala → .../src/main/scala/GeneExpressionStage.scala
@@ -1,8 +1,6 @@
 package org.broadinstitute.dig.aggregator.methods.bioindex
 
 import org.broadinstitute.dig.aggregator.core._
-import org.broadinstitute.dig.aws._
-import org.broadinstitute.dig.aws.config.emr._
 import org.broadinstitute.dig.aws.emr._
 
 /** The final result of all aggregator methods is building the BioIndex. All
@@ -21,7 +19,8 @@ class GeneExpressionStage(implicit context: Context) extends Stage {
 
   /** Use latest EMR release. */
   override val cluster: ClusterDef = super.cluster.copy(
-    releaseLabel = ReleaseLabel.emrLatest
+    releaseLabel = ReleaseLabel.emrLatest,
+    bootstrapScripts = Seq(new BootstrapScript(resourceUri("compression.sh")))
   )
 
   /** Output to Job steps. */
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,3 @@
		#!/bin/bash -xe

		sudo aws s3 cp s3://dig-data-registry/hail.jar /usr/lib/spark/jars/
Copy link Contributor Author sagehen03 Apr 18, 2023 • edited Loading Choose a reason for hiding this comment The reason will be displayed to describe this comment to others. Learn more. I built the hail.jar from the hail source (it needs to be compiled using Java 8 since that's what our EMR clusters use) and then uploaded it to S3. We probably need a better location in S3, but I used this one now since it's not in production use. This solution also relies on EMR continuing to put /usr/lib/spark/jars on the classpath. Copy link Contributor psmadbec Apr 18, 2023 Choose a reason for hiding this comment The reason will be displayed to describe this comment to others. Learn more. So for things like this we tend to use `s3://dig-aggregator-data/bin/`