Skip to content

Commit 6b4b356

Browse files
authored
Merge pull request #11 from VACLab/performance_improvement
Improved cohort creation performance, along with a minor performance improvement in get_concept_hierarchy
2 parents 56a23fd + ba1f0c0 commit 6b4b356

File tree

2 files changed

+27
-27
lines changed

2 files changed

+27
-27
lines changed

biasanalyzer/cohort.py

Lines changed: 5 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
from sqlalchemy.exc import SQLAlchemyError
22
import duckdb
3+
import pandas as pd
34
from datetime import datetime
45
from pydantic import ValidationError
56
from biasanalyzer.models import CohortDefinition, Cohort
@@ -110,14 +111,10 @@ def create_cohort(self, cohort_name: str, description: str, query_or_yaml_file:
110111
cohort_def_id = self.bias_db.create_cohort_definition(cohort_def)
111112

112113
# Store cohort_definition and cohort data into BiasDatabase
113-
for row in result:
114-
cohort = Cohort(
115-
subject_id=int(row['person_id']), # Assuming person_id column in the result
116-
cohort_definition_id=cohort_def_id,
117-
cohort_start_date=row['cohort_start_date'],
118-
cohort_end_date=row['cohort_end_date']
119-
)
120-
self.bias_db.create_cohort(cohort)
114+
cohort_df = pd.DataFrame(result)
115+
cohort_df['cohort_definition_id'] = cohort_def_id
116+
cohort_df = cohort_df.rename(columns={"person_id": "subject_id"})
117+
self.bias_db.create_cohort_in_bulk(cohort_df)
121118
print(f"Cohort {cohort_name} successfully created.")
122119
return CohortData(cohort_id=cohort_def_id, bias_db=self.bias_db, omop_db=self.omop_db)
123120
except duckdb.Error as e:

biasanalyzer/database.py

Lines changed: 22 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -119,6 +119,15 @@ def create_cohort(self, cohort: Cohort):
119119
cohort.cohort_end_date
120120
))
121121

122+
# Method to insert cohort data in bulk from a dataframe
123+
def create_cohort_in_bulk(self, cohort_df: pd.DataFrame):
124+
# Register the cohort_df DataFrame with DuckDB so it can be queried as a virtual table named "cohort_df"
125+
self.conn.register("cohort_df", cohort_df)
126+
self.conn.execute('''
127+
INSERT INTO cohort (subject_id, cohort_definition_id, cohort_start_date, cohort_end_date)
128+
SELECT subject_id, cohort_definition_id, cohort_start_date, cohort_end_date FROM cohort_df
129+
''')
130+
122131
def get_cohort_definition(self, cohort_definition_id):
123132
results = self.conn.execute(f'''
124133
SELECT id, name, description, created_date, creation_info, created_by FROM cohort_definition
@@ -417,12 +426,8 @@ def get_concept_hierarchy(self, concept_id: int):
417426

418427
results = self.execute_query(query, params={"concept_id": concept_id})
419428

420-
# Collect all concept IDs involved in the hierarchy
421-
concept_ids = set()
422-
for row in results:
423-
concept_ids.add(row['ancestor_concept_id'])
424-
concept_ids.add(row['descendant_concept_id'])
425-
429+
# Collect all unique concept IDs involved in the hierarchy using set comprehension
430+
concept_ids = {row['ancestor_concept_id'] for row in results} | {row['descendant_concept_id'] for row in results}
426431
# Fetch details of each concept
427432
concept_details = {}
428433
if concept_ids:
@@ -442,19 +447,17 @@ def get_concept_hierarchy(self, concept_id: int):
442447
ancestor_id = row['ancestor_concept_id']
443448
descendant_id = row['descendant_concept_id']
444449

445-
if ancestor_id not in hierarchy:
446-
hierarchy[ancestor_id] = {"details": concept_details[ancestor_id], "children": []}
447-
if descendant_id not in hierarchy:
448-
hierarchy[descendant_id] = {"details": concept_details[descendant_id], "children": []}
449-
# Link descendants to their ancestor node
450-
hierarchy[ancestor_id]["children"].append(hierarchy[descendant_id])
451-
452-
if descendant_id not in reverse_hierarchy:
453-
reverse_hierarchy[descendant_id] = {"details": concept_details[descendant_id], "parents": []}
454-
if ancestor_id not in reverse_hierarchy:
455-
reverse_hierarchy[ancestor_id] = {"details": concept_details[ancestor_id], "parents": []}
456-
# Link ancestors to their descendant (child) node
457-
reverse_hierarchy[descendant_id]["parents"].append(reverse_hierarchy[ancestor_id])
450+
ancestor_entry = hierarchy.setdefault(
451+
ancestor_id, {"details": concept_details[ancestor_id], "children": []})
452+
descendant_entry = hierarchy.setdefault(
453+
descendant_id, {"details": concept_details[descendant_id], "children": []})
454+
ancestor_entry["children"].append(descendant_entry)
455+
456+
desc_entry_rev = reverse_hierarchy.setdefault(
457+
descendant_id, {"details": concept_details[descendant_id], "parents": []})
458+
ancestor_entry_rev = reverse_hierarchy.setdefault(
459+
ancestor_id, {"details": concept_details[ancestor_id], "parents": []})
460+
desc_entry_rev["parents"].append(ancestor_entry_rev)
458461

459462
# Return the parent hierarchy and children hierarchy of the specified concept
460463
return reverse_hierarchy[concept_id], hierarchy[concept_id]

0 commit comments

Comments
 (0)