DB.py

#!/usr/bin/env python

import pymongo
from pprint import pprint
import re
import os
import sys
import math
from datetime import datetime
import ruamel.yaml
import traceback
import atexit
from random import shuffle
yaml = ruamel.yaml.YAML(typ="safe")
yaml.default_flow_style = False

def date_now():
    """
    Needed to keep the same date in python and mongo, as mongo rounds to millisecond
    """
    d = datetime.utcnow()
    return d.replace(microsecond=math.floor(d.microsecond/1000)*1000)

CONNECTION = None

def close_connection():
    global CONNECTION
    if CONNECTION is not None:
        CONNECTION.close()

atexit.register(close_connection)

def create_cgMLST_schema(name,loci):
    data_dict = {
        'name': name,
        'loci': loci,
        'length': len(loci)}
    data_dict = dump_cgMLST_schema_info(data_dict)
    return data_dict['_id']

def create_cgMLST_index(schema_id, capacity):
    schema = get_cgMLST_schema([schema_id])[0]
    schema_length = schema['length']
    order = random.shuffle(range(schema_length))
    data_dict = {
        'schema_id': schema_id,
        'order': order, # order in which loci are used in the binary tree
        'capacity': capacity, # number of 
        'index': [] # binary tree (allele_number % 2)
    }
    data_dict = dump_cgMLST_index_info(data_dict)
    return data_dict['_id']

def get_connection():
    global CONNECTION
    if CONNECTION is not None:
        return CONNECTION
    else:
        mongo_db_key_location = os.getenv("BIFROST_DB_KEY", None)
        print("location: '{}'".format(mongo_db_key_location))
        if mongo_db_key_location == '':
            print("Missing mongo db key",file=sys.stderr)
            sys.exit(1)
        with open(mongo_db_key_location, "r") as mongo_db_key_location_handle:
            mongodb_url = mongo_db_key_location_handle.readline().strip()
        # Return mongodb connection
        CONNECTION = pymongo.MongoClient(mongodb_url)
        return CONNECTION

def get_cgMLST_schema(schema_ids=None):
    """
    Return cgMLST based on query
    """
    query = []
    if schema_ids is not None:
        query.append({"_id": {"$in": schema_ids}})
    connection = get_connection()
    db = connection.get_database()
    if len(query) == 0:
        query = {}
    else:
        query = {"$and": query}
    return list(db.cgmlst_schemas.find(query).sort([("_id", pymongo.DESCENDING)]))

def dump_cgMLST_schema_info(data_dict):
    """Insert sample dict into mongodb.
    Return the dict with an _id element"""
    connection = get_connection()
    db = connection.get_database()
    cgmlst_schema_db = db.cgmlst_schemas  # Collection name is samples
    now = date_now()
    data_dict["metadata"] = data_dict.get("metadata", {})
    data_dict["metadata"]["updated_at"] = now
    if "_id" in data_dict:
        data_dict = cgmlst_schema_db.find_one_and_update(
            filter={"_id": data_dict["_id"]},
            update={"$set": data_dict},
            return_document=pymongo.ReturnDocument.AFTER,  # return new doc if one is upserted
            upsert=True  # This might change in the future # insert the document if it does not exist
        )
    else:
        data_dict["metadata"]["created_at"] = now
        result = cgmlst_schema_db.insert_one(data_dict)
        data_dict["_id"] = result.inserted_id

    return data_dict

def delete_cgMLST_schema(component_id):
    connection = get_connection()
    db = connection.get_database()
    deleted = db.cgmlst_schemas.delete_one({"_id": component_id})
    return deleted.deleted_count

def dump_cgmlst_allele_info(data_dict):
    """Insert cgMLST allele dict into mongodb.
    Return the dict with an _id element"""
    connection = get_connection()
    db = connection.get_database()
    cgmlst_db = db.cgmlst_alleles  # Collection name is cgmlst
    now = date_now()
    data_dict["metadata"] = data_dict.get("metadata", {'created_at': now})
    data_dict["metadata"]["updated_at"] = now
    if "_id" in data_dict:
        data_dict = cgmlst_db.find_one_and_update(
            filter={"_id": data_dict["_id"]},
            update={"$set": data_dict},
            return_document=pymongo.ReturnDocument.AFTER,  # return new doc if one is upserted
            upsert=True  # This might change in the future. It doesnt make much sense with our current system.
            # Import relies on this to be true.
            # insert the document if it does not exist
        )
    else:
        search_fields = {
            "sample._id": data_dict["sample"]["_id"],
            "cgmlst_schema._id": data_dict["cgmlst_schema"]["_id"],
        }
        data_dict = cgmlst_db.find_one_and_update(
            filter=search_fields,
            update={
                "$set": data_dict
            },
            return_document=pymongo.ReturnDocument.AFTER,  # return new doc if one is upserted
            upsert=True  # insert the document if it does not exist
        )
    return data_dict

def get_cgMLST_alleles(allele_ids=None):
    """
    Return cgMLST alleles based on query
    """
    query = []
    if allele_ids is not None:
        query.append({"_id": {"$in": allele_ids}})
    connection = get_connection()
    db = connection.get_database()
    if len(query) == 0:
        query = {}
    else:
        query = {"$and": query}
    return list(db.cgmlst_alleles.find(query).sort([("_id", pymongo.DESCENDING)]))

def delete_cgMLST_alleles(component_id):
    connection = get_connection()
    db = connection.get_database()
    deleted = db.cgmlst_alleles.delete_one({"_id": component_id})
    return deleted.deleted_count

def dump_cgmlst_index_info(data_dict):
    """Insert cgMLST allele dict into mongodb.
    Return the dict with an _id element"""
    connection = get_connection()
    db = connection.get_database()
    cgmlst_db = db.cgmlst_index  # Collection name is cgmlst_index
    now = date_now()
    data_dict["metadata"] = data_dict.get("metadata", {'created_at': now})
    data_dict["metadata"]["updated_at"] = now
    if "_id" in data_dict:
        data_dict = cgmlst_db.find_one_and_update(
            filter={"_id": data_dict["_id"]},
            update={"$set": data_dict},
            return_document=pymongo.ReturnDocument.AFTER,  # return new doc if one is upserted
            upsert=True  # This might change in the future. It doesnt make much sense with our current system.
            # Import relies on this to be true.
            # insert the document if it does not exist
        )
    else:
        search_fields = {
            "cgmlst_schema._id": data_dict["cgmlst_schema"]["_id"],
        }
        data_dict = cgmlst_db.find_one_and_update(
            filter=search_fields,
            update={
                "$set": data_dict
            },
            return_document=pymongo.ReturnDocument.AFTER,  # return new doc if one is upserted
            upsert=True  # insert the document if it does not exist
        )
    return data_dict

def get_cgMLST_index(index_ids=None):
    """
    Return cgMLST alleles based on query
    """
    query = []
    if index_ids is not None:
        query.append({"_id": {"$in": index_ids}})
    connection = get_connection()
    db = connection.get_database()
    if len(query) == 0:
        query = {}
    else:
        query = {"$and": query}
    return list(db.cgmlst_index.find(query).sort([("_id", pymongo.DESCENDING)]))

def delete_cgMLST_index(index_id):
    connection = get_connection()
    db = connection.get_database()
    deleted = db.cgmlst_index.delete_one({"_id": index_id})
    return deleted.deleted_count

if __name__ == "__main__":
    ## Test
    get_connection()

    data_dict = dump_cgMLST_schema_info({"test": "test"})
    pprint(data_dict)
    data_dict.update({"extra": 10})
    data_dict = dump_cgMLST_schema_info(data_dict)
    retrieved_schema = get_cgMLST_schema([data_dict["_id"]])
    pprint(retrieved_schema)
    schema_id = retrieved_schema[0]['_id']
    cgMLST_alleles = {
        'sample': {'_id': 12345},
        'cgmlst_schema': {'_id': schema_id},
        'alleles': [12, 235, 12]}
    data_dict = dump_cgmlst_allele_info(cgMLST_alleles)
    pprint(data_dict)
    alleles_id = data_dict['_id']
    cgMLST_index = {
        'cgmlst_schema': {'_id': schema_id},
        'type': "Trie",
        'order': [0, 2, 1],
        'index': {'12': {'12': {'235': [alleles_id]}}}
    }
    data_dict = dump_cgmlst_index_info(cgMLST_index)
    index_id = data_dict['_id']
    indices = get_cgMLST_index([index_id])
    pprint(indices)
    delete_cgMLST_index(index_id)
    delete_cgMLST_alleles(alleles_id)
    delete_cgMLST_index(index_id)