gentropy/Makefile at dev · opentargets/gentropy · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
# Recipes and $(shell ...) calls are executed with bash.
SHELL := /bin/bash
# Deployment defaults; every `?=` value below can be overridden from the
# environment or on the make command line.
PROJECT_ID ?= open-targets-genetics-dev
REGION ?= europe-west1
# Package name: third space-separated field of the first pyproject.toml line
# that contains "name", with double quotes removed. Resolved by make itself
# (not left as shell command-substitution text), so the value is usable in any
# context, not only unquoted inside a recipe.
APP_NAME ?= $(shell grep -m 1 "name" pyproject.toml | cut -d" " -f3 | sed 's/"//g')
# Package version: content of the first `version = "..."` line of pyproject.toml.
PACKAGE_VERSION ?= $(shell grep -m 1 'version = ' pyproject.toml | sed 's/version = "\(.*\)"/\1/')
# Lower-cased copy of $(USER).
USER_SAFE ?= $(shell echo $(USER) | tr '[:upper:]' '[:lower:]')
# Passed as --max-idle when the dev cluster is created.
CLUSTER_TIMEOUT ?= 60m
# NOTE: `git rev-parse --abbrev-ref HEAD` prints the literal string "HEAD" when
# the checkout is detached (e.g. it sits on a tag) and the branch name otherwise.
# This is how a tag checkout is told apart from a branch checkout: REF becomes
# the exactly matching tag name in the first case and the branch name in the
# second. The query runs once and its result is reused.
# The assignments are indented with spaces on purpose: a leading tab would turn
# them into recipe lines if a rule ever preceded this section.
git_head_ref := $(shell git rev-parse --abbrev-ref HEAD)
ifeq ($(git_head_ref),HEAD)
  REF ?= $(shell git describe --exact-match --tags)
else
  REF ?= $(git_head_ref)
endif

# PACKAGE_VERSION with every non-alphanumeric character removed; used in the
# dev cluster name and as the docker image tag.
CLEAN_PACKAGE_VERSION := $(shell echo "$(PACKAGE_VERSION)" | tr -cd '[:alnum:]')
# Cloud Storage prefix that the cluster init script and the CLI script are copied to.
BUCKET_NAME := gs://genetics_etl_python_playground/initialisation

# Declare as phony every target whose rule header directly follows an empty line
# in the parsed makefiles. A header is recognised when its first character is not
# a space, `.` or `#` and no space occurs before its first `:`; the target name is
# the text in front of that colon. A rule that is NOT preceded by an empty line
# (or that has a comment line between the empty line and its header) is not
# picked up, so keep one blank line immediately above each rule header.
.PHONY: $(shell sed -n -e '/^$$/ { n ; /^[^ .\#][^ ]*:/ { s/:.*$$// ; p ; } ; }' $(MAKEFILE_LIST))

# Goal that runs when make is invoked without an explicit target.
.DEFAULT_GOAL := help

help: ## This is help
# Scans the parsed makefiles for rule headers of the form `name: ... ## text`
# (name made of letters, `_` and `-`) and prints the name in colour, padded to
# 30 columns, followed by the text after `## `.
	@awk -F ':.*?## ' '/^[a-zA-Z_-]+:.*?## / {printf "\033[36m%-30s\033[0m %s\n", $$1, $$2}' $(MAKEFILE_LIST)

clean: ## Clean up prior to building
# Recursively removes ./dist; a missing directory is not treated as an error.
	@rm -rf ./dist

setup-dev: SHELL := $(shell echo $$SHELL)
# For this target only, SHELL is replaced by whatever the SHELL environment
# variable holds in a sub-shell, so the install script is sourced with that
# shell and the final hint names the matching rc file.
setup-dev:  ## Setup development environment
	@. utils/install_dependencies.sh
	@echo "Run . $(HOME)/.$(notdir $(SHELL))rc to finish setup"

check: ## Lint and format code
# Runs ruff over src/gentropy and the repository root, then pydoclint over src
# and over tests (short docstrings are skipped for tests). The first failing
# command stops the target.
	@echo "Linting API..." && uv run ruff check src/gentropy .
	@echo "Linting docstrings..." && uv run pydoclint --config=pyproject.toml src
	@uv run pydoclint --config=pyproject.toml --skip-checking-short-docstrings=true tests

test-no-shared-spark-session: ## Run tests that can not rely on shared SparkSession.
# Selects tests marked `no_shared_spark` but not `download_jars_from_web`, runs
# them with -n0 and stores coverage data in .coverage.no_shared_spark; the empty
# --cov-report= value is passed through to pytest unchanged.
	@echo "Running tests that can not rely on shared SparkSession fixture..."
	@COVERAGE_FILE=.coverage.no_shared_spark uv run pytest -n0 -m "no_shared_spark and not download_jars_from_web" --cov-report=

test-shared-spark-session: ## Run tests that can use shared SparkSession fixture.
# Runs pytest without an explicit marker expression and stores coverage data in
# .coverage.shared_spark.
	@echo "Running tests that can share SparkSession fixture..." && \
		COVERAGE_FILE=.coverage.shared_spark uv run pytest --cov-report=

test-no-shared-spark-session-web-dependencies: ## Run tests that require to download spark dependency jars from the web (not run by default).
# Selects only tests marked `download_jars_from_web`, runs them with -n0 and
# stores coverage data in .coverage.no_shared_spark_web_deps. This target is not
# a prerequisite of `test`.
	@echo "Running tests that can not rely on shared SparkSession and require downloading jar dependencies from web..." && \
		COVERAGE_FILE=.coverage.no_shared_spark_web_deps uv run pytest -n0 -m "download_jars_from_web" --cov-report=

test: test-no-shared-spark-session test-shared-spark-session ## Run default test suite
# After both suites have run, merges their coverage data files, writes the XML
# coverage report and removes the per-suite data files if any are left.
	@uv run coverage combine .coverage.shared_spark .coverage.no_shared_spark && uv run coverage xml
	@rm -f .coverage.shared_spark .coverage.no_shared_spark

build-documentation: ## Create local server with documentation
# Announces the build, then starts `mkdocs serve` through uv.
	@echo "Building Documentation..." && uv run mkdocs serve

sync-cluster-init-script: ## Synchronize the cluster initialisation actions script to google cloud
# Copies utils/install_dependencies_on_cluster.sh to $(BUCKET_NAME), where the
# dev cluster targets pick it up.
	@echo "Syncing install_dependencies_on_cluster.sh to ${BUCKET_NAME}"
	@gcloud storage cp utils/install_dependencies_on_cluster.sh ${BUCKET_NAME}/install_dependencies_on_cluster.sh

sync-gentropy-cli-script: ## Synchronize the gentropy cli script
# Copies src/gentropy/cli.py to $(BUCKET_NAME)/cli.py.
	@echo "Syncing gentropy cli script to $(BUCKET_NAME)"
	@gcloud storage cp src/gentropy/cli.py $(BUCKET_NAME)/cli.py

create-dev-cluster: sync-cluster-init-script sync-gentropy-cli-script ## Spin up a simple dataproc cluster with all dependencies for development purposes
# Steps:
#   1. (prerequisites) upload the init script and the CLI script to $(BUCKET_NAME);
#   2. abort unless utils/clean_status.sh succeeds for $(REF);
#   3. select $(PROJECT_ID) as the active gcloud project;
#   4. create the cluster ot-genetics-dev-<CLEAN_PACKAGE_VERSION>-<USER_SAFE>,
#      handing $(REF) to the init script as GENTROPY_REF metadata and using
#      $(CLUSTER_TIMEOUT) as the --max-idle value.
# The --labels value is a plain comma-separated key=value list without a
# trailing comma.
	@echo "Making sure the cluster can reference to ${REF} branch to install gentropy..."
	@./utils/clean_status.sh ${REF} || (echo "ERROR: Commit and push local changes, to have up to date cluster"; exit 1)
	@echo "Creating Dataproc Dev Cluster"
	gcloud config set project ${PROJECT_ID}
	gcloud dataproc clusters create "ot-genetics-dev-${CLEAN_PACKAGE_VERSION}-$(USER_SAFE)" \
		--image-version 2.2 \
		--region ${REGION} \
		--master-machine-type n1-standard-16 \
		--metadata="GENTROPY_REF=${REF}" \
		--initialization-actions=${BUCKET_NAME}/install_dependencies_on_cluster.sh \
		--secondary-worker-type spot \
		--worker-machine-type n1-standard-16 \
		--public-ip-address \
		--worker-boot-disk-size 500 \
		--autoscaling-policy="projects/${PROJECT_ID}/regions/${REGION}/autoscalingPolicies/otg-etl" \
		--optional-components=JUPYTER \
		--enable-component-gateway \
		--labels team=open-targets,subteam=gentropy,created_by=${USER_SAFE},environment=development \
		--max-idle=${CLUSTER_TIMEOUT}

update-dev-cluster: build ## Reinstalls the package on the dev-cluster
# Submits a pig job that makes the init script from $(BUCKET_NAME) executable
# and runs it on the cluster. The cluster name carries the same
# -<USER_SAFE> suffix that create-dev-cluster uses, so the job is sent to the
# cluster that target actually creates.
	@echo "Updating Dataproc Dev Cluster"
	@gcloud config set project ${PROJECT_ID}
	gcloud dataproc jobs submit pig --cluster="ot-genetics-dev-${CLEAN_PACKAGE_VERSION}-$(USER_SAFE)" \
		--region ${REGION} \
		--jars=${BUCKET_NAME}/install_dependencies_on_cluster.sh \
		-e='sh chmod 750 $${PWD}/install_dependencies_on_cluster.sh; sh $${PWD}/install_dependencies_on_cluster.sh'

build: clean ## Build Python package with dependencies
# `clean` runs first and removes ./dist; the package is then built with `uv build`.
	@uv build

build-docker: ## Build docker container locally
# Builds the image from ./Dockerfile with the repository root as build context
# and tags it <APP_NAME>:<CLEAN_PACKAGE_VERSION>.
	docker build --tag $(APP_NAME):$(CLEAN_PACKAGE_VERSION) --file Dockerfile .