From e0e6a3dc910535aa83211cd63effb986275026e9 Mon Sep 17 00:00:00 2001
From: Mart Ratas
Date: Tue, 10 Oct 2023 14:04:06 +0300
Subject: [PATCH] CU-8692wgmkm: Remove py2neo dependency and the code that
 used it (#356)

* CU-8692wgmkm: Remove py2neo dependency and the code that used it

* CU-8692wgmkm: Remove medcat.neo package from setup.py
---
 medcat/neo/__init__.py         |   0
 medcat/neo/data_preparation.py | 231 ---------------------------------
 medcat/neo/neo_connector.py    | 161 -----------------------
 setup.py                       |   3 +-
 4 files changed, 1 insertion(+), 394 deletions(-)
 delete mode 100644 medcat/neo/__init__.py
 delete mode 100644 medcat/neo/data_preparation.py
 delete mode 100644 medcat/neo/neo_connector.py

diff --git a/medcat/neo/__init__.py b/medcat/neo/__init__.py
deleted file mode 100644
index e69de29bb..000000000
diff --git a/medcat/neo/data_preparation.py b/medcat/neo/data_preparation.py
deleted file mode 100644
index 551c3117e..000000000
--- a/medcat/neo/data_preparation.py
+++ /dev/null
@@ -1,231 +0,0 @@
-import os
-import pandas as pd
-
-
-def get_index_queries():
-    """Run these before everything else to speed things up."""
-    return ['CREATE INDEX patientId FOR (p:Patient) ON (p.patientId);',
-            'CREATE INDEX conceptId FOR (c:Concept) ON (c.conceptId);',
-            'CREATE INDEX documentId FOR (d:Document) ON (d.documentId);']
-
-
-def create_neo_csv(data, columns, output_dir='/etc/lib/neo4j/import/',
-                   base_name='patients'):
-    """Creates a CSV for the neo4j LOAD CSV function.
-
-    Args:
-        data:
-            A dataframe or path to a dataframe with the required data.
-        columns:
-            What data to use from the dataframe.
-        output_dir:
-            Where to save the CSVs, should be the neo4j import path if possible.
-        base_name:
-            Name of the CSV.
-    """
-    if isinstance(data, pd.DataFrame):
-        df = data
-    else:
-        df = pd.read_csv(data)
-
-    # Remove duplicates
-    df = df.drop_duplicates(subset=columns)
-
-    out_df = df[columns]
-    data_path = os.path.join(output_dir, f"{base_name}.csv")
-    out_df.to_csv(data_path, index=False)
-
-
-def create_patients_csv(data, output_dir='/etc/lib/neo4j/import/',
-                        base_name='patients'):
-    """Creates a patients CSV for the neo4j LOAD CSV function.
-
-    Args:
-        data:
-            A dataframe or path to a dataframe with the required data:
-            patientId, sex, ethnicity, dob.
-        output_dir:
-            Where to save the CSVs, should be the neo4j import path if
-            possible, but writing there may require admin privileges.
-
-    Returns:
-        str: The query.
-    """
-    query = (
-        'USING PERIODIC COMMIT 100000 \n'
-        f'LOAD CSV WITH HEADERS FROM "file:///{base_name}.csv" AS row \n'
-        'CREATE (:Patient {patientId: toString(row.patientId), \n'
-        '                  sex: toString(row.sex), \n'
-        '                  ethnicity: toString(row.ethnicity), \n'
-        '                  dob: datetime(row.dob)}) \n'
-    )
-
-    create_neo_csv(data=data, columns=['patientId', 'sex', 'ethnicity', 'dob'],
-                   output_dir=output_dir, base_name=base_name)
-
-    return query
-
-
-def create_documents_csv(data, output_dir='/etc/lib/neo4j/import/',
-                         base_name='documents'):
-    """Creates a documents CSV for the neo4j LOAD CSV function.
-
-    Args:
-        data:
-            A dataframe or path to a dataframe with the required data:
-            documentId.
-        output_dir:
-            Where to save the CSVs, should be the neo4j import path if possible.
-
-    Returns:
-        str: The query.
-    """
-    query = (
-        'USING PERIODIC COMMIT 100000 \n'
-        f'LOAD CSV WITH HEADERS FROM "file:///{base_name}.csv" AS row \n'
-        'CREATE (:Document {documentId: toString(row.documentId)}) \n'
-    )
-
-    create_neo_csv(data=data, columns=['documentId'],
-                   output_dir=output_dir, base_name=base_name)
-
-    return query
-
-
-def create_concepts_csv(data, output_dir='/etc/lib/neo4j/import/',
-                        base_name='concepts'):
-    """Creates a concepts CSV for the neo4j LOAD CSV function.
-
-    Args:
-        data:
-            A dataframe or path to a dataframe with the required data:
-            conceptId, name and type.
-        output_dir:
-            Where to save the CSVs, should be the neo4j import path if possible.
-
-    Returns:
-        str: The query.
-    """
-    query = (
-        'USING PERIODIC COMMIT 100000 \n'
-        f'LOAD CSV WITH HEADERS FROM "file:///{base_name}.csv" AS row \n'
-        'CREATE (:Concept {conceptId: toString(row.conceptId), \n'
-        '                  type: toString(row.type), \n'
-        '                  name: toString(row.name)}) \n'
-    )
-
-    create_neo_csv(data=data, columns=['conceptId', 'name', 'type'],
-                   output_dir=output_dir, base_name=base_name)
-
-    return query
-
-
-def create_document2patient_csv(data, output_dir='/etc/lib/neo4j/import/',
-                                base_name='document2patient'):
-    """Creates a document2patient CSV for the neo4j LOAD CSV function.
-
-    Args:
-        data:
-            A dataframe or path to a dataframe with the required data:
-            patientId and documentId.
-        output_dir:
-            Where to save the CSVs, should be the neo4j import path if possible.
-
-    Returns:
-        str: The query.
-    """
-    query = (
-        'USING PERIODIC COMMIT 100000 \n'
-        f'LOAD CSV WITH HEADERS FROM "file:///{base_name}.csv" AS row \n'
-        'MATCH (pt:Patient {patientId: toString(row.patientId)}) \n'
-        'MATCH (doc:Document {documentId: toString(row.documentId)}) \n'
-        'CREATE (pt)-[:HAS]->(doc); \n'
-    )
-
-    create_neo_csv(data=data, columns=['patientId', 'documentId'],
-                   output_dir=output_dir, base_name=base_name)
-
-    return query
-
-
-def create_concept_ontology_csv(data, output_dir='/etc/lib/neo4j/import/',
-                                base_name='concept_ontology'):
-    """Creates a concept-ontology CSV for the neo4j LOAD CSV function.
-
-    Args:
-        data:
-            A dataframe or path to a dataframe with the required data:
-            child, parent.
-        output_dir:
-            Where to save the CSVs, should be the neo4j import path if possible.
-
-    Returns:
-        str: The query.
-    """
-    query = (
-        'USING PERIODIC COMMIT 100000 \n'
-        f'LOAD CSV WITH HEADERS FROM "file:///{base_name}.csv" AS row \n'
-        'MATCH (child:Concept {conceptId: toString(row.child)}) \n'
-        'MATCH (parent:Concept {conceptId: toString(row.parent)}) \n'
-        'CREATE (child)-[:IS_A]->(parent); \n'
-    )
-
-    create_neo_csv(data=data, columns=['child', 'parent'],
-                   output_dir=output_dir, base_name=base_name)
-
-    return query
-
-
-def create_document2concept_csv(data, output_dir='/etc/lib/neo4j/import/',
-                                base_name='document2concepts'):
-    """Creates a document2concept CSV for the neo4j LOAD CSV function.
-
-    Args:
-        data:
-            A dataframe or path to a dataframe with the required data:
-            'conceptId', 'documentId', 'contextSimilarity', 'start', 'end',
-            'timestamp', 'metaSubject', 'metaPresence', 'metaTime'.
-        output_dir:
-            Where to save the CSVs, should be the neo4j import path if possible.
-
-    Returns:
-        str: The query.
-    """
-    query = (
-        'USING PERIODIC COMMIT 100000 \n'
-        f'LOAD CSV WITH HEADERS FROM "file:///{base_name}.csv" AS row \n'
-        'MATCH (doc:Document {documentId: toString(row.documentId)}) \n'
-        'MATCH (concept:Concept {conceptId: toString(row.conceptId)}) \n'
-        'CREATE (doc)-[:HAS {start: toInteger(row.start), \n'
-        '                    end: toInteger(row.end), \n'
-        '                    timestamp: toInteger(row.timestamp), \n'
-        '                    contextSimilarity: toFloat(row.contextSimilarity), \n'
-        '                    metaSubject: toString(row.metaSubject), \n'
-        '                    metaPresence: toString(row.metaPresence), \n'
-        '                    metaTime: toString(row.metaTime) \n'
-        '                    }]->(concept); \n'
-    )
-
-    columns = ['conceptId', 'documentId', 'contextSimilarity', 'start',
-               'end', 'timestamp', 'metaSubject', 'metaPresence', 'metaTime']
-
-    create_neo_csv(data=data, columns=columns,
-                   output_dir=output_dir, base_name=base_name)
-
-    return query
-
-
-def get_data_from_docs(docs, doc2pt, doc2time=None):
-    """Convert MedCAT output documents into rows for create_document2concept_csv."""
-    data = [['conceptId', 'documentId', 'contextSimilarity',
-             'start', 'end', 'timestamp', 'metaSubject',
-             'metaPresence', 'metaTime']]
-
-    for doc_id, doc in docs.items():
-        for ent in doc['entities'].values():
-            #if ent['meta_anns']['Subject']['value'] == 'Patient' and \
-            #   ent['meta_anns']['Presence']['value'] == 'True':
-            if doc2time is not None:
-                t = doc2time[doc_id]
-            else:
-                t = ent['document_timestamp']
-
-            row = [ent['cui'], doc_id,
-                   ent['context_similarity'],
-                   ent['start'], ent['end'],
-                   t,
-                   ent['meta_anns'].get('Subject', {}).get('value', None),
-                   ent['meta_anns'].get('Presence', {}).get('value', None),
-                   ent['meta_anns'].get('Time', {}).get('value', None)]
-            data.append(row)
-
-    return data
diff --git a/medcat/neo/neo_connector.py b/medcat/neo/neo_connector.py
deleted file mode 100644
index 69eef0f7e..000000000
--- a/medcat/neo/neo_connector.py
+++ /dev/null
@@ -1,161 +0,0 @@
-from py2neo import Graph
-import getpass
-from collections import defaultdict
-
-
-class NeoConnector:
-    def __init__(self, uri, user, password=None):
-        if password is None:
-            password = getpass.getpass("Password:")
-        self.graph = Graph(uri, auth=(user, password))
-
-    def execute(self, query):
-        r = self.graph.run(query)
-        return r
-
-    def bucket_concepts(self, data, bucket_size_seconds):
-        """Merge entities into time buckets, keeping one entity per concept per bucket."""
-        entities = data['entities']
-
-        _bucket = []
-        _concepts = set()
-        start_time = -1
-        new_stream = []
-        # Sort entities by time of appearance
-        entities.sort(key=lambda ent: ent['timestamp'])
-        for ent in entities:
-            if start_time == -1:
-                start_time = ent['timestamp']
-
-            if ent['timestamp'] - start_time >= bucket_size_seconds:
-                # Flush the current bucket into the stream
-                new_stream.extend(_bucket)
-                _bucket = []
-                _concepts = set()
-                start_time = ent['timestamp']
-
-                # Append a separator entity marking the bucket boundary
-                t_ent = dict(new_stream[-1])
-                t_ent['timestamp'] += 1
-                t_ent['name'] = ''
-                t_ent['conceptId'] = ''
-                new_stream.append(t_ent)
-
-            if ent['conceptId'] not in _concepts:
-                _bucket.append(ent)
-                _concepts.add(ent['conceptId'])
-
-        if _bucket:
-            new_stream.extend(_bucket)
-
-        data['entities'] = new_stream
-
-    def get_all_patients(self, concepts, limit=1000, require_time=False, ignore_meta=False):
-        """Return all patients having all of the given concepts.
-
-        Args:
-            concepts: The concepts.
-            limit: The maximum number of results. Defaults to 1000.
-            require_time: If set, only concepts that have the timestamp property will be used.
-            ignore_meta: If set, the metaPresence/metaSubject filter is not applied.
-        """
-
-        q = "WITH [{}] AS cs ".format(",".join(["'{}'".format(c) for c in concepts]))
-        if not require_time:
-            q += '''MATCH (c:Concept)<-[:HAS '''
-            if not ignore_meta:
-                q += '''{metaPresence: 'True', metaSubject: 'Patient'}'''
-            q += ''']-(:Document)<-[:HAS]-(pt:Patient)
-            WHERE c.conceptId in cs
-            WITH pt, size(cs) as inputCnt, count(DISTINCT c) as cnt
-            WHERE cnt = inputCnt
-            '''
-        else:
-            q += '''MATCH (c:Concept)<-[r:HAS {metaPresence: 'True', metaSubject:
-            'Patient'}]-(:Document)<-[:HAS]-(pt:Patient) \n
-            WHERE c.conceptId in cs AND exists(r.timestamp) \n
-            WITH pt, size(cs) as inputCnt, count(DISTINCT c) as cnt \n
-            WHERE cnt = inputCnt \n
-            '''
-
-        q += ' RETURN pt LIMIT {}'.format(limit)
-        data = self.execute(q).data()  # Do not like this too much
-
-        return [n['pt']['patientId'] for n in data], q
-
-    def get_all_concepts_from(self, patient_id=None, document_id=None,
-                              limit=1000, bucket_size_seconds=None, min_count=0,
-                              meta_requirements=None, require_time=True):
-        """Return all concepts belonging to a document or patient,
-        given the concept type (if None, all are returned).
-        """
-
-        if patient_id is not None:
-            q = 'MATCH (patient:Patient {patientId: "%s"})-[:HAS]->' % patient_id \
-                + '(document:Document)-[has:HAS]->(concept:Concept) \n'
-        elif document_id is not None:
-            q = 'MATCH (patient:Patient)-[:HAS]->(document:Document {documentId: "%s"})' % document_id \
-                + '-[has:HAS]->(concept:Concept) \n'
-        else:
-            raise ValueError("patient_id or document_id is required")
-        q += 'RETURN patient, document, concept, has LIMIT %s \n' % limit
-
-        data = self.execute(q).data()  # Do not like this too much
-        out = None
-        if len(data) > 0:
-            out = {'patient': dict(data[0]['patient']),
-                   'entities': []}
-
-            cnt = defaultdict(int)
-            for row in data:
-                if meta_requirements is None or \
-                        all(row['has'][meta] == value for meta, value in meta_requirements.items()):
-                    if not require_time or 'timestamp' in row['has']:
-                        ent = dict(row['concept'])  # Take everything from concept
-                        ent['documentId'] = row['document']['documentId']
-                        ent.update(row['has'])  # Add everything from the meta annotation
-
-                        out['entities'].append(ent)
-                        cnt[ent['conceptId']] += 1
-
-            # Cleanup based on min_count
-            new_ents = []
-            for ent in out['entities']:
-                if cnt[ent['conceptId']] >= min_count:
-                    ent['count'] = cnt[ent['conceptId']]
-                    new_ents.append(ent)
-            out['entities'] = new_ents
-
-            if bucket_size_seconds is not None:
-                self.bucket_concepts(data=out, bucket_size_seconds=bucket_size_seconds)
-
-        return out, q
-
-    def get_all_patients_descend(self, concepts, limit=1000, require_time=False):
-        """Return all patients having all descendant concepts under the ancestor concept.
-
-        Args:
-            concepts: Ancestor top-level concepts.
-            limit: The maximum number of results. Defaults to 1000.
-            require_time: If set, only concepts that have the timestamp property will be used.
-                Defaults to False.
-
-        Returns:
-            List: Patients with attached SNOMED concepts.
-        """
-
-        q = "WITH [{}] AS ancestor ".format(",".join(["'{}'".format(c) for c in concepts]))
-        if not require_time:
-            q += '''MATCH (n:Concept)-[:IS_A*0..5]->(m:Concept)
-            WHERE m.conceptId IN ancestor // get the ancestor and the children
-            WITH [n.conceptId] AS lineage // pass the lineage to the patient match
-            MATCH (c:Concept)<-[r:HAS {metaPresence: 'True', metaSubject: 'Patient'}]-(d:Document)<-[q:HAS]-(pt:Patient)
-            WHERE c.conceptId in lineage
-            '''
-        else:
-            q += '''MATCH (n:Concept)-[:IS_A*0..5]->(m:Concept)
-            WHERE m.conceptId IN ancestor // get the ancestor and the children
-            WITH [n.conceptId] AS lineage // pass the lineage to the patient match
-            MATCH (c:Concept)<-[r:HAS {metaPresence: 'True', metaSubject: 'Patient'}]-(d:Document)<-[q:HAS]-(pt:Patient)
-            WHERE c.conceptId in lineage AND exists(r.timestamp)
-            '''
-
-        q += ' RETURN pt.patientId, pt.sex, c.conceptId, c.name, r.timestamp LIMIT {}'.format(limit)
-        data = self.execute(q).data()  # Do not like this too much
-
-        return [n['pt.patientId'] for n in data], q
diff --git a/setup.py b/setup.py
index 4e73b2f89..68d68fb43 100644
--- a/setup.py
+++ b/setup.py
@@ -15,7 +15,7 @@
     long_description_content_type="text/markdown",
     url="https://github.com/CogStack/MedCAT",
     packages=['medcat', 'medcat.utils', 'medcat.preprocessing', 'medcat.ner', 'medcat.linking', 'medcat.datasets',
-              'medcat.tokenizers', 'medcat.utils.meta_cat', 'medcat.pipeline', 'medcat.neo', 'medcat.utils.ner',
+              'medcat.tokenizers', 'medcat.utils.meta_cat', 'medcat.pipeline', 'medcat.utils.ner',
               'medcat.utils.saving', 'medcat.utils.regression'],
     install_requires=[
         'numpy>=1.22.0',  # first to support 3.11
@@ -33,7 +33,6 @@
         'psutil>=5.8.0',
         # 0.70.12 uses older version of dill (i.e. less than 0.3.5) which is required for datasets
         'multiprocess~=0.70.12',  # 0.70.14 seemed to work just fine
-        'py2neo~=2021.2.3',
         'aiofiles>=0.8.0',  # allow later versions, tested with 22.1.0
         'ipywidgets>=7.6.5',  # allow later versions, tested with 0.8.0
        'xxhash>=3.0.0', # allow later versions, tested with 3.1.0
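
Migration note (not part of the patch): the removed NeoConnector was a thin
wrapper around py2neo's Graph (Graph(uri, auth=(user, password)) plus
graph.run(query).data()). For code that depended on it, a minimal sketch of
equivalent connectivity using the official neo4j Python driver follows; the
URI, credentials and query are illustrative assumptions, not MedCAT API.

    from neo4j import GraphDatabase
    import getpass

    # Assumed connection details; adjust for your deployment.
    uri = "bolt://localhost:7687"
    user = "neo4j"
    password = getpass.getpass("Password:")

    driver = GraphDatabase.driver(uri, auth=(user, password))
    with driver.session() as session:
        # Rough equivalent of NeoConnector.execute(query).data()
        result = session.run(
            "MATCH (pt:Patient) RETURN pt.patientId AS patientId LIMIT 10")
        patient_ids = [record["patientId"] for record in result]
    driver.close()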