Implement multiple external IDs for participants and samples (#659)
* Create participant_external_id and sample_external_id database tables

* Pluralise external_ids in participant and sample models

* Query *_external_id tables and implement upserting into them

Change these simple sample/participant table SELECTs to also join the
*_external_id table into the query. Update comments of SampleLayer etc
methods that call through to these to indicate that they can look up
samples via this table too.
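In the updated queries, the joined per-sample rows are aggregated back into a `{name: external_id}` mapping (the SQL does this with `JSON_OBJECTAGG`, as seen in db/python/layers/web.py below). A minimal Python sketch of the same collapsing step, using made-up row values rather than real metamist data:

```python
from collections import defaultdict

# Illustrative rows as a sample query joined with sample_external_id
# might return them: (sample_id, name, external_id). Values are
# invented for demonstration only.
rows = [
    (1, '', 'EXT001'),
    (1, 'seqcentre', 'SC-42'),
    (2, '', 'EXT002'),
]

def group_external_ids(rows):
    """Collapse joined (sample_id, name, external_id) rows into
    a {sample_id: {name: external_id}} mapping."""
    by_sample = defaultdict(dict)
    for sample_id, name, external_id in rows:
        by_sample[sample_id][name] = external_id
    return dict(by_sample)
```

The empty-string name is the primary organisation's ID, so each sample's old single `external_id` appears under key `''`.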

In WebDb._project_summary_sample_query(), we use a hack to divert
external_id queries to the appropriate peid/seid *_external_id table.

Fix documentation typo in SampleTable.get_sample_by_id().
Add from_db_json() utility, the opposite of to_db_json().
Add PRIMARY_EXTERNAL_ORG constant to clarify primary org name usage.
Decode JSON-encoded external_ids fields in SampleInternal.from_db()
and ParticipantInternal.from_db() routines.
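The from_db_json() utility is, in essence, the inverse of to_db_json(). A hedged sketch of the pair (the real helpers live in the db/python utilities and may differ in signature and edge-case handling):

```python
import json

def to_db_json(val):
    """Encode a Python value for storage in a JSON/LONGTEXT column."""
    return json.dumps(val)

def from_db_json(text):
    """Decode a JSON value read back from the database,
    tolerating NULLs and byte strings."""
    if text is None:
        return None
    if isinstance(text, bytes):
        text = text.decode('utf-8')
    return json.loads(text)
```

This round-trip is what lets SampleInternal.from_db() and ParticipantInternal.from_db() turn the JSON-encoded external_ids column back into a dict.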

* Use the primary external_id for GraphQLParticipant/GraphQLSample

[TODO] In future we will want to represent the extra external_ids
in GraphQLParticipant/GraphQLSample too. But this stopgap lets us
defer that until after the tests/ directory is adjusted.

* Use the primary external_id for ParsedParticipant/ParsedSample et al

[TODO] In future we will want to represent the extra external_ids
in ParsedParticipant/ParsedSample too. But this stopgap lets us
defer that until after the tests/ directory is adjusted.

[TODO] These would prefer metamist.models to export PRIMARY_EXTERNAL_ORG
(or from somewhere similar) so they didn't need to define it themselves.
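The PRIMARY_EXTERNAL_ORG convention used throughout is that the empty string keys the primary organisation's ID (see the migration comment in db/project.xml below). A small illustrative sketch, with invented example IDs:

```python
# The primary external organisation is keyed by the empty string,
# mirroring the PRIMARY_EXTERNAL_ORG constant described above.
PRIMARY_EXTERNAL_ORG = ''

def primary_external_id(external_ids):
    """Pick the primary external ID out of a {name: external_id} map."""
    return external_ids[PRIMARY_EXTERNAL_ORG]

# Example map: primary ID plus one extra org-specific ID (made up).
external_ids = {PRIMARY_EXTERNAL_ORG: 'NA12878', 'biobank': 'BB-17'}
```

This is the lookup that GraphQLParticipant/GraphQLSample and the parser stopgaps perform until the extra external_ids are surfaced properly.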

* Adjust tests to account for s/external_id/external_ids/g

test_sample.py: Add a test exercising get_sample_id_map_by_external_ids().

test_upsert.py: Update direct SQL queries to use the new tables and
to ensure that their rows are returned in the expected order.

* Add tests exercising multiple external_id functionality

* Remove obsolete participant/sample table external_id constraints

But first we need to create indexes for the correspondingly named foreign
keys, which previously re-used these constraints' composite indexes. This
avoids "Cannot drop index needed in a foreign key constraint" errors.

* Bodge SampleLayer.get_history_of_sample()

Attempting to update the history query interfered with the intended
behaviour of audit_log_ids. Revert the changes to propagate audit_log_id
within untouched sample_external_id rows, and revert to the previous
sample-table-only get_history_of_sample() with limitations: for now,
it doesn't return external_ids or audit_log_ids.

* Update db/project.xml per review

Make external_ids unique per project regardless of external org (name)
and adjust tests accordingly.
Make the new tables' audit_log_id field non-NULL, and cons up a new
audit_log entry for the external_id migrations to refer to.

Drop the old external_id fields' not-NULL constraint before clearing their
values -- previously omitted as this was tested only on empty tables!
This is in a separate changeSet as system_versioning_alter_history=1 is
needed, and we'd prefer not to set that during the main changeSet.
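The data migration in db/project.xml copies each row's old single external_id into the new table under the primary (empty-string) org name, all tagged with one freshly created audit_log row. A rough Python rendering of that copy step (field names follow the new tables; the dict-shaped rows are illustrative):

```python
PRIMARY_EXTERNAL_ORG = ''  # matches the migration's '' name key

def migrate_external_ids(participants, audit_log_id):
    """Produce participant_external_id-shaped rows from the old
    one-external_id-per-participant rows."""
    return [
        {
            'project': p['project'],
            'participant_id': p['id'],
            'name': PRIMARY_EXTERNAL_ORG,
            'external_id': p['external_id'],
            'audit_log_id': audit_log_id,
        }
        for p in participants
    ]
```

The sample migration is identical in shape, with sample_id in place of participant_id.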

* For now, create_test_subset.py copies only the primary external_id

[TODO] In future we will represent the extra external_ids in GraphQLSample
too and be able to copy all of them.

* Bump version: 7.0.3 → 7.1.0
jmarshall authored Jun 13, 2024
1 parent f177019 commit c748513
Showing 46 changed files with 990 additions and 241 deletions.
2 changes: 1 addition & 1 deletion .bumpversion.cfg
@@ -1,5 +1,5 @@
[bumpversion]
current_version = 7.0.4
current_version = 7.1.0
commit = True
tag = False
parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>[A-z0-9-]+)
5 changes: 3 additions & 2 deletions api/graphql/schema.py
@@ -41,6 +41,7 @@
from db.python.utils import GenericFilter
from models.enums import AnalysisStatus
from models.models import (
PRIMARY_EXTERNAL_ORG,
AnalysisInternal,
AssayInternal,
AuditLogInternal,
@@ -636,7 +637,7 @@ class GraphQLParticipant:
def from_internal(internal: ParticipantInternal) -> 'GraphQLParticipant':
return GraphQLParticipant(
id=internal.id,
external_id=internal.external_id,
external_id=internal.external_ids[PRIMARY_EXTERNAL_ORG],
meta=internal.meta,
reported_sex=internal.reported_sex,
reported_gender=internal.reported_gender,
@@ -725,7 +726,7 @@ class GraphQLSample:
def from_internal(sample: SampleInternal) -> 'GraphQLSample':
return GraphQLSample(
id=sample_id_format(sample.id),
external_id=sample.external_id,
external_id=sample.external_ids[PRIMARY_EXTERNAL_ORG],
active=sample.active,
meta=sample.meta,
type=sample.type,
2 changes: 1 addition & 1 deletion api/server.py
@@ -20,7 +20,7 @@
from db.python.utils import get_logger

# This tag is automatically updated by bump2version
_VERSION = '7.0.4'
_VERSION = '7.1.0'


logger = get_logger()
139 changes: 139 additions & 0 deletions db/project.xml
@@ -1281,4 +1281,143 @@
<sql>ALTER TABLE cohort_sequencing_group ADD SYSTEM VERSIONING;</sql>
<sql>ALTER TABLE analysis_cohort ADD SYSTEM VERSIONING;</sql>
</changeSet>
<changeSet id="2024-05-03-multiple-external-ids" author="john.marshall">
<createTable tableName="participant_external_id">
<column name="project" type="INT">
<constraints
nullable="false"
foreignKeyName="FK_PROJECT_PARTICIPANT_EXTERNAL_ID"
references="project(id)" />
</column>
<column name="participant_id" type="INT">
<constraints
nullable="false"
foreignKeyName="FK_PARTICIPANT_PARTICIPANT_EXTERNAL_ID"
references="participant(id)" />
</column>
<column name="name" type="VARCHAR(255)" />
<column name="external_id" type="VARCHAR(255)">
<constraints nullable="false" />
</column>
<column name="meta" type="LONGTEXT" />
<column name="audit_log_id" type="INT">
<constraints
nullable="false"
foreignKeyName="FK_PARTICIPANT_EXTERNAL_ID_CHANGELOG_ID"
references="audit_log(id)" />
</column>
</createTable>
<addPrimaryKey
tableName="participant_external_id"
columnNames="participant_id,name"
constraintName="PK_PARTICIPANT_EXTERNAL_ID"
validate="true"
/>
<addUniqueConstraint
tableName="participant_external_id"
columnNames="project,external_id"
constraintName="UK_PARTICIPANT_EXTERNAL_ID_UNIQUE_EIDS"
validate="true"
/>
<createIndex tableName="participant" indexName="fk_project_participant">
<column name="project" />
</createIndex>
<dropUniqueConstraint
tableName="participant"
constraintName="UK_PARTICIPANT_SAMPLE_EXTERNALID"
/>

<createTable tableName="sample_external_id">
<column name="project" type="INT">
<constraints
nullable="false"
foreignKeyName="FK_PROJECT_SAMPLE_EXTERNAL_ID"
references="project(id)" />
</column>
<column name="sample_id" type="INT">
<constraints
nullable="false"
foreignKeyName="FK_SAMPLE_SAMPLE_EXTERNAL_ID"
references="sample(id)" />
</column>
<column name="name" type="VARCHAR(255)" />
<column name="external_id" type="VARCHAR(255)">
<constraints nullable="false" />
</column>
<column name="meta" type="LONGTEXT" />
<column name="audit_log_id" type="INT">
<constraints
nullable="false"
foreignKeyName="FK_SAMPLE_EXTERNAL_ID_CHANGELOG_ID"
references="audit_log(id)" />
</column>
</createTable>
<addPrimaryKey
tableName="sample_external_id"
columnNames="sample_id,name"
constraintName="PK_SAMPLE_EXTERNAL_ID"
validate="true"
/>
<addUniqueConstraint
tableName="sample_external_id"
columnNames="project,external_id"
constraintName="UK_SAMPLE_EXTERNAL_ID_UNIQUE_EIDS"
validate="true"
/>
<createIndex tableName="sample" indexName="fk_project_sample">
<column name="project" />
</createIndex>
<dropUniqueConstraint
tableName="sample"
constraintName="UK_SAMPLE_PROJECT_EXTERNALID"
/>

<sql>ALTER TABLE participant_external_id ADD SYSTEM VERSIONING;</sql>
<sql>ALTER TABLE sample_external_id ADD SYSTEM VERSIONING;</sql>

<!-- Migrate existing external_ids to the new tables, keyed by PRIMARY_EXTERNAL_ORG, i.e. '' -->
<sql>INSERT INTO audit_log (author, on_behalf_of, ar_guid, comment, auth_project, meta)
VALUES ('liquibase', NULL, NULL, 'participant/sample external_id migration', NULL, NULL)
RETURNING @audit_log_id := id;

INSERT INTO participant_external_id (project, participant_id, name, external_id, audit_log_id)
SELECT project, id, '', external_id, @audit_log_id
FROM participant;

INSERT INTO sample_external_id (project, sample_id, name, external_id, audit_log_id)
SELECT project, id, '', external_id, @audit_log_id
FROM sample;
</sql>
</changeSet>
<changeSet id="2024-06-11-drop-old-external-id-columns" author="john.marshall">
<sql>SET @@system_versioning_alter_history = 1;</sql>

<dropNotNullConstraint
tableName="participant"
columnName="external_id"
columnDataType="VARCHAR(255)"
/>
<sql>UPDATE participant SET external_id = NULL</sql>
<renameColumn
tableName="participant"
oldColumnName="external_id"
newColumnName="_external_id_unused"
columnDataType="VARCHAR(255)"
remarks="Migration of external IDs to separate table"
/>

<dropNotNullConstraint
tableName="sample"
columnName="external_id"
columnDataType="VARCHAR(255)"
/>
<sql>UPDATE sample SET external_id = NULL</sql>
<renameColumn
tableName="sample"
oldColumnName="external_id"
newColumnName="_external_id_unused"
columnDataType="VARCHAR(255)"
remarks="Migration of external IDs to separate table"
/>
</changeSet>
</databaseChangeLog>
2 changes: 2 additions & 0 deletions db/python/connect.py
@@ -29,6 +29,8 @@
'analysis_sequencing_group',
'analysis_sample',
'assay_external_id',
'participant_external_id',
'sample_external_id',
'sequencing_group_external_id',
'family',
'family_participant',
4 changes: 2 additions & 2 deletions db/python/layers/family.py
@@ -12,9 +12,9 @@
from db.python.tables.participant import ParticipantTable
from db.python.tables.sample import SampleTable
from db.python.utils import GenericFilter, NotFoundError
from models.models import PRIMARY_EXTERNAL_ORG, ProjectId
from models.models.family import FamilyInternal, PedRow, PedRowInternal
from models.models.participant import ParticipantUpsertInternal
from models.models.project import ProjectId


class FamilyLayer(BaseLayer):
@@ -299,7 +299,7 @@ async def import_pedigree(
continue
upserted_participant = await participant_table.upsert_participant(
ParticipantUpsertInternal(
external_id=row.individual_id,
external_ids={PRIMARY_EXTERNAL_ORG: row.individual_id},
reported_sex=row.sex,
)
)
16 changes: 10 additions & 6 deletions db/python/layers/participant.py
@@ -20,9 +20,9 @@
NotFoundError,
split_generic_terms,
)
from models.models import PRIMARY_EXTERNAL_ORG, ProjectId
from models.models.family import PedRowInternal
from models.models.participant import ParticipantInternal, ParticipantUpsertInternal
from models.models.project import ProjectId

HPO_REGEX_MATCHER = re.compile(r'HP\:\d+$')

@@ -250,6 +250,7 @@ async def get_participants_by_ids(
) -> list[ParticipantInternal]:
"""
Get participants by IDs
(note that the list returned need not be ordered as per the pids argument)
"""
projects, participants = await self.pttable.get_participants_by_ids(pids)

@@ -301,7 +302,10 @@ async def fill_in_missing_participants(self):
)
)
external_sample_map_with_no_pid = {
sample.external_id: sample.id for sample in samples_with_no_pid
sample.external_ids[PRIMARY_EXTERNAL_ORG]: sample.id for sample in samples_with_no_pid
}
external_ids_by_primary = {
sample.external_ids[PRIMARY_EXTERNAL_ORG]: sample.external_ids for sample in samples_with_no_pid
}
ext_sample_id_to_pid = {}

@@ -328,7 +332,7 @@
for external_id in external_participant_ids_to_add:
sample_id = external_sample_map_with_no_pid[external_id]
participant_id = await self.pttable.create_participant(
external_id=external_id,
external_ids=external_ids_by_primary[external_id],
reported_sex=None,
reported_gender=None,
karyotype=None,
@@ -410,7 +414,7 @@ async def generic_individual_metadata_importer(
)
for ex_pid in missing_participant_eids:
external_pid_map[ex_pid] = await self.pttable.create_participant(
external_id=ex_pid,
external_ids={PRIMARY_EXTERNAL_ORG: ex_pid},
reported_sex=None,
reported_gender=None,
karyotype=None,
@@ -605,7 +609,7 @@ async def upsert_participant(
)
await self.pttable.update_participant(
participant_id=participant.id,
external_id=participant.external_id,
external_ids=participant.external_ids,
reported_sex=participant.reported_sex,
reported_gender=participant.reported_gender,
meta=participant.meta,
@@ -614,7 +618,7 @@

else:
participant.id = await self.pttable.create_participant(
external_id=participant.external_id,
external_ids=participant.external_ids,
reported_sex=participant.reported_sex,
reported_gender=participant.reported_gender,
karyotype=participant.karyotype,
8 changes: 4 additions & 4 deletions db/python/layers/sample.py
@@ -91,7 +91,7 @@ async def get_sample_by_id(
async def get_single_by_external_id(
self, external_id, project: ProjectId, check_active=True
) -> SampleInternal:
"""Get a Sample by its external_id"""
"""Get a Sample by (any of) its external_id(s)"""
return await self.st.get_single_by_external_id(
external_id, project, check_active=check_active
)
@@ -102,7 +102,7 @@ async def get_sample_id_map_by_external_ids(
project: ProjectId = None,
allow_missing=False,
) -> dict[str, int]:
"""Get map of samples {external_id: internal_id}"""
"""Get map of samples {(any) external_id: internal_id}"""
external_ids_set = set(external_ids)
_project = project or self.connection.project
assert _project
@@ -231,7 +231,7 @@ async def upsert_sample(
async with with_function():
if not sample.id:
sample.id = await self.st.insert_sample(
external_id=sample.external_id,
external_ids=sample.external_ids,
sample_type=sample.type,
active=True,
meta=sample.meta,
@@ -242,7 +242,7 @@ async def upsert_sample(
# Otherwise update
await self.st.update_sample(
id_=sample.id, # type: ignore
external_id=sample.external_id,
external_ids=sample.external_ids,
meta=sample.meta,
participant_id=sample.participant_id,
type_=sample.type,
2 changes: 1 addition & 1 deletion db/python/layers/search.py
@@ -66,7 +66,7 @@ async def _get_search_result_for_sample(
except NotFoundError:
return None

sample_eids = [sample.external_id]
sample_eids = list(sample.external_ids.values())
participant_id = int(sample.participant_id) if sample.participant_id else None
participant_eids: list[str] = []
family_eids: list[str] = []
21 changes: 14 additions & 7 deletions db/python/layers/web.py
@@ -71,6 +71,8 @@ def _project_summary_sample_query(self, grid_filter: list[SearchItem]):
# protect against SQL injection attacks
raise ValueError('Invalid characters in field')
if not query.is_meta:
if field == 'external_id':
prefix += 'eid' # this field is in its own table
q = f'{prefix}.{field} LIKE :{key}'
else:
# double double quote field to allow white space
@@ -84,8 +86,10 @@ def _project_summary_sample_query(self, grid_filter: list[SearchItem]):
# the query to determine the total count, then take the selection of samples
# for the current page. This is more efficient than doing 2 queries separately.
sample_query = f"""
SELECT s.id, s.external_id, s.type, s.meta, s.participant_id, s.active
SELECT s.id, JSON_OBJECTAGG(seid.name, seid.external_id) AS external_ids,
s.type, s.meta, s.participant_id, s.active
FROM sample s
LEFT JOIN sample_external_id seid ON s.id = seid.sample_id
LEFT JOIN assay a ON s.id = a.sample_id
LEFT JOIN participant p ON p.id = s.participant_id
LEFT JOIN family_participant fp on s.participant_id = fp.participant_id
@@ -185,7 +189,7 @@ def _project_summary_process_sample_rows(
smodels = [
NestedSampleInternal(
id=s['id'],
external_id=s['external_id'],
external_ids=json.loads(s['external_ids']),
type=s['type'],
meta=json.loads(s['meta']) or {},
created_date=str(sample_id_start_times.get(s['id'], '')),
@@ -337,9 +341,12 @@ async def get_project_summary(

# participant
p_query = """
SELECT id, external_id, meta, reported_sex, reported_gender, karyotype
FROM participant
WHERE id in :pids
SELECT p.id, JSON_OBJECTAGG(peid.name, peid.external_id) AS external_ids,
p.meta, p.reported_sex, p.reported_gender, p.karyotype
FROM participant p
INNER JOIN participant_external_id peid ON p.id = peid.participant_id
WHERE p.id in :pids
GROUP BY p.id
"""
participant_promise = self.connection.fetch_all(p_query, {'pids': pids})

@@ -427,7 +434,7 @@ async def get_project_summary(
pmodels.append(
NestedParticipantInternal(
id=None,
external_id=None,
external_ids=None,
meta=None,
families=[],
samples=[s],
@@ -443,7 +450,7 @@
pmodels.append(
NestedParticipantInternal(
id=p['id'],
external_id=p['external_id'],
external_ids=json.loads(p['external_ids']),
meta=json.loads(p['meta']),
families=pid_to_families.get(p['id'], []),
samples=list(smodels_by_pid.get(p['id'])),