diff --git a/hawc/apps/common/exports.py b/hawc/apps/common/exports.py index 6693a06ad2..ac257e7fe2 100644 --- a/hawc/apps/common/exports.py +++ b/hawc/apps/common/exports.py @@ -1,4 +1,5 @@ import pandas as pd +from django.conf import settings from django.db.models import QuerySet from .helper import FlatExport @@ -11,21 +12,13 @@ def __init__( self, key_prefix: str = "", query_prefix: str = "", - include: tuple[str, ...] | None = None, - exclude: tuple[str, ...] | None = None, + include: tuple | None = None, + exclude: tuple | None = None, ): - """Instantiate an exporter instance for a given django model. - - Args: - key_prefix (str, optional): The model name to prepend to data frame columns. - query_prefix (str, optional): The model prefix in the ORM. - include (tuple | None, optional): If included, only these items are added. - exclude (tuple | None, optional): If specified, items are removed from base. - """ self.key_prefix = key_prefix + "-" if key_prefix else key_prefix self.query_prefix = query_prefix + "__" if query_prefix else query_prefix - self.include = (key_prefix + field for field in include) if include else tuple() - self.exclude = (key_prefix + field for field in exclude) if exclude else tuple() + self.include = tuple(self.key_prefix + field for field in include) if include else tuple() + self.exclude = tuple(self.key_prefix + field for field in exclude) if exclude else tuple() @property def value_map(self) -> dict: @@ -153,6 +146,14 @@ def prepare_df(self, df: pd.DataFrame) -> pd.DataFrame: """ return df + def format_time(self, df: pd.DataFrame) -> pd.DataFrame: + for key in [self.get_column_name("created"), self.get_column_name("last_updated")]: + if key in df.columns: + df.loc[:, key] = df[key].apply( + lambda x: x.tz_convert(settings.TIME_ZONE).isoformat() + ) + return df + def get_df(self, qs: QuerySet) -> pd.DataFrame: """Get dataframe export from queryset. 
@@ -211,7 +212,6 @@ def get_df(self, qs: QuerySet) -> pd.DataFrame: @classmethod def flat_export(cls, qs: QuerySet, filename: str) -> FlatExport: """Return an instance of a FlatExport. - Args: qs (QuerySet): the initial QuerySet filename (str): the filename for the export diff --git a/hawc/apps/epi/exports.py b/hawc/apps/epi/exports.py index befea2868c..4656266a41 100644 --- a/hawc/apps/epi/exports.py +++ b/hawc/apps/epi/exports.py @@ -1,266 +1,636 @@ +import math + +import pandas as pd +from django.db.models import Case, Q, When + +from ..common.exports import Exporter, ModelExport from ..common.helper import FlatFileExporter +from ..common.models import sql_display, sql_format, str_m2m from ..materialized.models import FinalRiskOfBiasScore -from ..study.models import Study -from . import models +from ..study.exports import StudyExport +from . import constants, models -class OutcomeComplete(FlatFileExporter): - def _get_header_row(self): - header = [] - header.extend(Study.flat_complete_header_row()) - header.extend(models.StudyPopulation.flat_complete_header_row()) - header.extend(models.Outcome.flat_complete_header_row()) - header.extend(models.Exposure.flat_complete_header_row()) - header.extend(models.ComparisonSet.flat_complete_header_row()) - header.extend(models.Result.flat_complete_header_row()) - header.extend(models.Group.flat_complete_header_row()) - header.extend(models.GroupResult.flat_complete_header_row()) - return header - - def _get_data_rows(self): - rows = [] - identifiers_df = Study.identifiers_df(self.queryset, "study_population__study_id") - for obj in self.queryset: - ser = obj.get_json(json_encode=False) - row = [] - row.extend( - Study.flat_complete_data_row(ser["study_population"]["study"], identifiers_df) +def percent_control(n_1, mu_1, sd_1, n_2, mu_2, sd_2): + mean = low = high = None + + if mu_1 and mu_2 and mu_1 != 0: + mean = (mu_2 - mu_1) / mu_1 * 100.0 + if sd_1 and sd_2 and n_1 and n_2: + sd = math.sqrt( + pow(mu_1, -2) + * 
((pow(sd_2, 2) / n_2) + (pow(mu_2, 2) * pow(sd_1, 2)) / (n_1 * pow(mu_1, 2))) ) - row.extend(models.StudyPopulation.flat_complete_data_row(ser["study_population"])) - row.extend(models.Outcome.flat_complete_data_row(ser)) - for res in ser["results"]: - row_copy = list(row) - row_copy.extend( - models.Exposure.flat_complete_data_row(res["comparison_set"]["exposure"]) - ) - row_copy.extend(models.ComparisonSet.flat_complete_data_row(res["comparison_set"])) - row_copy.extend(models.Result.flat_complete_data_row(res)) - for rg in res["results"]: - row_copy2 = list(row_copy) - row_copy2.extend(models.Group.flat_complete_data_row(rg["group"])) - row_copy2.extend(models.GroupResult.flat_complete_data_row(rg)) - rows.append(row_copy2) - return rows + ci = (1.96 * sd) * 100 + rng = sorted([mean - ci, mean + ci]) + low = rng[0] + high = rng[1] + + return mean, low, high + + +class StudyPopulationExport(ModelExport): + def get_value_map(self): + return { + "id": "id", + "url": "url", + "name": "name", + "design": "design_display", + "age_profile": "age_profile", + "source": "source", + "countries": "countries__name", + "region": "region", + "state": "state", + "eligible_n": "eligible_n", + "invited_n": "invited_n", + "participant_n": "participant_n", + "inclusion_criteria": "inclusion_criteria", + "exclusion_criteria": "exclusion_criteria", + "confounding_criteria": "confounding_criteria", + "comments": "comments", + "created": "created", + "last_updated": "last_updated", + } + + def get_annotation_map(self, query_prefix): + return { + "url": sql_format("/epi/study-population/{}/", query_prefix + "id"), # hardcoded URL + "design_display": sql_display(query_prefix + "design", constants.Design), + "countries__name": str_m2m(query_prefix + "countries__name"), + "inclusion_criteria": str_m2m( + query_prefix + "spcriteria__criteria__description", + filter=Q(**{query_prefix + "spcriteria__criteria_type": constants.CriteriaType.I}), + ), + "exclusion_criteria": str_m2m( + 
query_prefix + "spcriteria__criteria__description", + filter=Q(**{query_prefix + "spcriteria__criteria_type": constants.CriteriaType.E}), + ), + "confounding_criteria": str_m2m( + query_prefix + "spcriteria__criteria__description", + filter=Q(**{query_prefix + "spcriteria__criteria_type": constants.CriteriaType.C}), + ), + } + + def prepare_df(self, df): + return self.format_time(df) + + +class OutcomeExport(ModelExport): + def get_value_map(self): + return { + "id": "id", + "url": "url", + "name": "name", + "effects": "effects__name", + "system": "system", + "effect": "effect", + "effect_subtype": "effect_subtype", + "diagnostic": "diagnostic_display", + "diagnostic_description": "diagnostic_description", + "age_of_measurement": "age_of_measurement", + "outcome_n": "outcome_n", + "summary": "summary", + "created": "created", + "last_updated": "last_updated", + } + + def get_annotation_map(self, query_prefix): + return { + "url": sql_format("/epi/outcome/{}/", query_prefix + "id"), # hardcoded URL + "effects__name": str_m2m(query_prefix + "effects__name"), + "diagnostic_display": sql_display(query_prefix + "diagnostic", constants.Diagnostic), + } + + def prepare_df(self, df): + return self.format_time(df) + + +class ExposureExport(ModelExport): + def get_value_map(self): + return { + "id": "id", + "url": "url", + "name": "name", + "inhalation": "inhalation", + "dermal": "dermal", + "oral": "oral", + "in_utero": "in_utero", + "iv": "iv", + "unknown_route": "unknown_route", + "measured": "measured", + "metric": "metric", + "metric_units_id": "metric_units__id", + "metric_units_name": "metric_units__name", + "metric_description": "metric_description", + "analytical_method": "analytical_method", + "sampling_period": "sampling_period", + "age_of_exposure": "age_of_exposure", + "duration": "duration", + "n": "n", + "exposure_distribution": "exposure_distribution", + "description": "description", + "created": "created", + "last_updated": "last_updated", + } + + def 
get_annotation_map(self, query_prefix): + return { + "url": sql_format("/epi/exposure/{}/", query_prefix + "id"), # hardcoded URL + } + + def prepare_df(self, df): + return self.format_time(df) + + +class ComparisonSetExport(ModelExport): + def get_value_map(self): + return { + "id": "id", + "url": "url", + "name": "name", + "description": "description", + "created": "created", + "last_updated": "last_updated", + } + + def get_annotation_map(self, query_prefix): + return { + "url": sql_format("/epi/comparison-set/{}/", query_prefix + "id"), # hardcoded URL + } + + def prepare_df(self, df): + return self.format_time(df) + + +class ResultMetricExport(ModelExport): + def get_value_map(self): + return { + "id": "id", + "name": "metric", + "abbreviation": "abbreviation", + } + + +class ResultExport(ModelExport): + def get_value_map(self): + return { + "id": "id", + "name": "name", + "metric_description": "metric_description", + "metric_units": "metric_units", + "data_location": "data_location", + "population_description": "population_description", + "dose_response": "dose_response_display", + "dose_response_details": "dose_response_details", + "prevalence_incidence": "prevalence_incidence", + "statistical_power": "statistical_power_display", + "statistical_power_details": "statistical_power_details", + "statistical_test_results": "statistical_test_results", + "trend_test": "trend_test", + "adjustment_factors": "adjustment_factors", + "adjustment_factors_considered": "adjustment_factors_considered", + "estimate_type": "estimate_type_display", + "variance_type": "variance_type_display", + "ci_units": "ci_units", + "comments": "comments", + "created": "created", + "last_updated": "last_updated", + "tags": "tags", + } + + def get_annotation_map(self, query_prefix): + return { + "dose_response_display": sql_display( + query_prefix + "dose_response", constants.DoseResponse + ), + "adjustment_factors": str_m2m( + query_prefix + "resfactors__adjustment_factor__description", + 
filter=Q(**{query_prefix + "resfactors__included_in_final_model": True}), + ), + "adjustment_factors_considered": str_m2m( + query_prefix + "resfactors__adjustment_factor__description", + filter=Q(**{query_prefix + "resfactors__included_in_final_model": False}), + ), + "statistical_power_display": sql_display( + query_prefix + "statistical_power", constants.StatisticalPower + ), + "estimate_type_display": sql_display( + query_prefix + "estimate_type", constants.EstimateType + ), + "variance_type_display": sql_display( + query_prefix + "variance_type", constants.VarianceType + ), + "tags": str_m2m(query_prefix + "resulttags__name"), + } + + def prepare_df(self, df): + return self.format_time(df) + + +class GroupExport(ModelExport): + def get_value_map(self): + return { + "id": "id", + "group_id": "group_id", + "name": "name", + "numeric": "numeric", + "comparative_name": "comparative_name", + "sex": "sex_display", + "ethnicities": "ethnicities", + "eligible_n": "eligible_n", + "invited_n": "invited_n", + "participant_n": "participant_n", + "isControl": "isControl", + "comments": "comments", + "created": "created", + "last_updated": "last_updated", + } + + def get_annotation_map(self, query_prefix): + return { + "sex_display": sql_display(query_prefix + "sex", constants.Sex), + "ethnicities": str_m2m(query_prefix + "ethnicities__name"), + } + + def prepare_df(self, df): + return self.format_time(df) + + +class GroupResultExport(ModelExport): + def get_value_map(self): + return { + "id": "id", + "n": "n", + "estimate": "estimate", + "variance": "variance", + "lower_ci": "lower_ci", + "upper_ci": "upper_ci", + "lower_range": "lower_range", + "upper_range": "upper_range", + "lower_bound_interval": "lower_bound_interval", + "upper_bound_interval": "upper_bound_interval", + "p_value_qualifier": "p_value_qualifier_display", + "p_value": "p_value", + "is_main_finding": "is_main_finding", + "main_finding_support": "main_finding_support_display", + "created": "created", + 
"last_updated": "last_updated", + } + + def get_annotation_map(self, query_prefix): + return { + "lower_bound_interval": Case( + When(**{query_prefix + "lower_ci": None}, then=query_prefix + "lower_range"), + default=query_prefix + "lower_ci", + ), + "upper_bound_interval": Case( + When(**{query_prefix + "upper_ci": None}, then=query_prefix + "upper_range"), + default=query_prefix + "upper_ci", + ), + "p_value_qualifier_display": sql_display( + query_prefix + "p_value_qualifier", constants.PValueQualifier + ), + "main_finding_support_display": sql_display( + query_prefix + "main_finding_support", constants.MainFinding + ), + } + + def prepare_df(self, df): + return self.format_time(df) + + +class CentralTendencyExport(ModelExport): + def get_value_map(self): + return { + "estimate": "estimate", + "estimate_type": "estimate_type_display", + "variance": "variance", + "variance_type": "variance_type_display", + "lower_bound_interval": "lower_bound_interval", + "upper_bound_interval": "upper_bound_interval", + "lower_ci": "lower_ci", + "upper_ci": "upper_ci", + "lower_range": "lower_range", + "upper_range": "upper_range", + } + + def get_annotation_map(self, query_prefix): + return { + "estimate_type_display": sql_display( + query_prefix + "estimate_type", constants.EstimateType + ), + "variance_type_display": sql_display( + query_prefix + "variance_type", constants.VarianceType + ), + "lower_bound_interval": Case( + When(**{query_prefix + "lower_ci": None}, then=query_prefix + "lower_range"), + default=query_prefix + "lower_ci", + ), + "upper_bound_interval": Case( + When(**{query_prefix + "upper_ci": None}, then=query_prefix + "upper_range"), + default=query_prefix + "upper_ci", + ), + } + + +class EpiExporter(Exporter): + def build_modules(self) -> list[ModelExport]: + return [ + StudyExport("study", "study_population__study"), + StudyPopulationExport("sp", "study_population"), + OutcomeExport("outcome", ""), + ExposureExport("exposure", 
"results__comparison_set__exposure"), + ComparisonSetExport("cs", "results__comparison_set"), + ResultMetricExport("metric", "results__metric"), + ResultExport("result", "results", exclude=("tags",)), + GroupExport("group", "results__results__group"), + GroupResultExport("result_group", "results__results"), + ] + + +class OutcomeComplete(FlatFileExporter): + """ + Returns a complete export of all data required to rebuild the the + epidemiological meta-result study type from scratch. + """ + + def build_df(self) -> pd.DataFrame: + return EpiExporter().get_df(self.queryset) + + +class EpiDataPivotExporter(Exporter): + def build_modules(self) -> list[ModelExport]: + return [ + StudyExport( + "study", + "study_population__study", + include=("id", "short_citation", "study_identifier", "published"), + ), + StudyPopulationExport( + "sp", "study_population", include=("id", "name", "age_profile", "source", "design") + ), + OutcomeExport( + "outcome", + "", + include=( + "id", + "name", + "system", + "effect", + "effect_subtype", + "diagnostic", + "age_of_measurement", + "effects", + ), + ), + ComparisonSetExport("cs", "results__comparison_set", include=("id", "name")), + ExposureExport( + "exposure", + "results__comparison_set__exposure", + include=( + "id", + "name", + "metric", + "measured", + "metric_units_name", + "age_of_exposure", + ), + ), + CentralTendencyExport( + "ct", + "results__comparison_set__exposure__central_tendencies", + include=( + "estimate", + "estimate_type", + "variance", + "variance_type", + "lower_bound_interval", + "upper_bound_interval", + "lower_ci", + "upper_ci", + "lower_range", + "upper_range", + ), + ), + ResultExport( + "result", + "results", + include=( + "id", + "name", + "population_description", + "tags", + "metric_description", + "comments", + "dose_response", + "statistical_power", + "statistical_test_results", + "ci_units", + "estimate_type", + "variance_type", + ), + ), + ResultMetricExport("metric", "results__metric", 
include=("name", "abbreviation")), + GroupExport( + "group", + "results__results__group", + include=("group_id", "name", "comparative_name", "numeric", "isControl"), + ), + GroupResultExport( + "result_group", + "results__results", + include=( + "id", + "n", + "estimate", + "lower_ci", + "upper_ci", + "lower_range", + "upper_range", + "lower_bound_interval", + "upper_bound_interval", + "variance", + "p_value", + "p_value_qualifier", + "is_main_finding", + "main_finding_support", + ), + ), + ] class OutcomeDataPivot(FlatFileExporter): - def _get_header_row(self): - if self.queryset.first() is None: - self.rob_headers, self.rob_data = {}, {} - else: - outcome_ids = set(self.queryset.values_list("id", flat=True)) - self.rob_headers, self.rob_data = FinalRiskOfBiasScore.get_dp_export( - self.queryset.first().assessment_id, - outcome_ids, - "epi", + def _add_percent_control(self, df: pd.DataFrame) -> pd.DataFrame: + def _get_stdev(x: pd.Series): + return models.GroupResult.stdev( + x["result-variance_type"], x["result_group-variance"], x["result_group-n"] ) - headers = [ - "study id", - "study name", - "study identifier", - "study published", - "study population id", - "study population name", - "study population age profile", - "study population source", - "design", - "outcome id", - "outcome name", - "outcome system", - "outcome effect", - "outcome effect subtype", - "diagnostic", - "age of outcome measurement", - "tags", - ] + def _apply_results(_df1: pd.DataFrame): + controls = _df1.loc[_df1["group-isControl"] == True] # noqa: E712 + control = _df1.iloc[0] if controls.empty else controls.iloc[0] + n_1 = control["result_group-n"] + mu_1 = control["result_group-estimate"] + sd_1 = _get_stdev(control) + + def _apply_result_groups(_df2: pd.DataFrame): + row = _df2.iloc[0] + if control["result-estimate_type"] in ["median", "mean"] and control[ + "result-variance_type" + ] in [ + "SD", + "SE", + "SEM", + ]: + n_2 = row["result_group-n"] + mu_2 = 
row["result_group-estimate"] + sd_2 = _get_stdev(row) + mean, low, high = percent_control(n_1, mu_1, sd_1, n_2, mu_2, sd_2) + return pd.DataFrame( + [[mean, low, high]], + columns=[ + "percent control mean", + "percent control low", + "percent control high", + ], + index=[row["result_group-id"]], + ) + return pd.DataFrame( + [], + columns=[ + "percent control mean", + "percent control low", + "percent control high", + ], + ) + + rgs = _df1.groupby("result_group-id", group_keys=False) + return rgs.apply(_apply_result_groups) - headers.extend(list(self.rob_headers.values())) - - headers.extend( - [ - "comparison set id", - "comparison set name", - "exposure id", - "exposure name", - "exposure metric", - "exposure measured", - "dose units", - "age of exposure", - "exposure estimate", - "exposure estimate type", - "exposure variance", - "exposure variance type", - "exposure lower bound interval", - "exposure upper bound interval", - "exposure lower ci", - "exposure upper ci", - "exposure lower range", - "exposure upper range", - "result id", - "result name", - "result population description", - "result tags", - "statistical metric", - "statistical metric abbreviation", - "statistical metric description", - "result summary", - "dose response", - "statistical power", - "statistical test results", - "CI units", - "exposure group order", - "exposure group name", - "exposure group comparison name", - "exposure group numeric", - "Reference/Exposure group", - "Result, summary numerical", - "key", - "result group id", - "N", - "estimate", - "lower CI", - "upper CI", - "lower range", - "upper range", - "lower bound interval", - "upper bound interval", - "variance", - "statistical significance", - "statistical significance (numeric)", - "main finding", - "main finding support", - "percent control mean", - "percent control low", - "percent control high", - ] + results = df.groupby("result-id", group_keys=False) + computed_df = results.apply(_apply_results) + return 
df.join(computed_df, on="result_group-id").drop( + columns=["result-estimate_type", "result-variance_type", "group-isControl"] ) - return headers - - def _get_data_rows(self): - rows = [] - for obj in self.queryset: - ser = obj.get_json(json_encode=False) - row = [ - ser["study_population"]["study"]["id"], - ser["study_population"]["study"]["short_citation"], - ser["study_population"]["study"]["study_identifier"], - ser["study_population"]["study"]["published"], - ser["study_population"]["id"], - ser["study_population"]["name"], - ser["study_population"]["age_profile"], - ser["study_population"]["source"], - ser["study_population"]["design"], - ser["id"], - ser["name"], - ser["system"], - ser["effect"], - ser["effect_subtype"], - ser["diagnostic"], - ser["age_of_measurement"], - self.get_flattened_tags(ser, "effects"), - ] - outcome_robs = [ - self.rob_data[(ser["id"], metric_id)] for metric_id in self.rob_headers.keys() - ] - row.extend(outcome_robs) - - for res in ser["results"]: - row_copy = list(row) - - # comparison set - row_copy.extend([res["comparison_set"]["id"], res["comparison_set"]["name"]]) - - # exposure (may be missing) - if res["comparison_set"]["exposure"]: - row_copy.extend( - [ - res["comparison_set"]["exposure"]["id"], - res["comparison_set"]["exposure"]["name"], - res["comparison_set"]["exposure"]["metric"], - res["comparison_set"]["exposure"]["measured"], - res["comparison_set"]["exposure"]["metric_units"]["name"], - res["comparison_set"]["exposure"]["age_of_exposure"], - ] - ) + def build_df(self) -> pd.DataFrame: + df = EpiDataPivotExporter().get_df(self.queryset.order_by("id", "results__results")) + outcome_ids = list(df["outcome-id"].unique()) + rob_headers, rob_data = FinalRiskOfBiasScore.get_dp_export( + self.queryset.first().assessment_id, + outcome_ids, + "epi", + ) + rob_df = pd.DataFrame( + data=[ + [rob_data[(outcome_id, metric_id)] for metric_id in rob_headers.keys()] + for outcome_id in outcome_ids + ], + 
columns=list(rob_headers.values()), + index=outcome_ids, + ) + df = df.join(rob_df, on="outcome-id") - num_rows_for_ct = len(res["comparison_set"]["exposure"]["central_tendencies"]) - if num_rows_for_ct == 0: - row_copy.extend(["-"] * 10) - self.addOutcomesAndGroupsToRowAndAppend(rows, res, ser, row_copy) - else: - for ct in res["comparison_set"]["exposure"]["central_tendencies"]: - row_copy_ct = list(row_copy) - row_copy_ct.extend( - [ - ct["estimate"], - ct["estimate_type"], - ct["variance"], - ct["variance_type"], - ct["lower_bound_interval"], - ct["upper_bound_interval"], - ct["lower_ci"], - ct["upper_ci"], - ct["lower_range"], - ct["upper_range"], - ] - ) - self.addOutcomesAndGroupsToRowAndAppend(rows, res, ser, row_copy_ct) - - else: - row_copy.extend(["-"] * (6 + 10)) # exposure + exposure.central_tendencies - self.addOutcomesAndGroupsToRowAndAppend(rows, res, ser, row_copy) - - return rows - - def addOutcomesAndGroupsToRowAndAppend(self, rows, res, ser, row): - # outcome details - row.extend( - [ - res["id"], - res["name"], - res["population_description"], - self.get_flattened_tags(res, "resulttags"), - res["metric"]["metric"], - res["metric"]["abbreviation"], - res["metric_description"], - res["comments"], - res["dose_response"], - res["statistical_power"], - res["statistical_test_results"], - res["ci_units"], - ] + df["Reference/Exposure group"] = ( + df["study-short_citation"] + + " (" + + df["group-name"] + + ", n=" + + df["result_group-n"].astype(str) + + ")" + ) + df["Result, summary numerical"] = ( + df["result_group-estimate"].astype(str) + + " (" + + df["result_group-lower_ci"].astype(str) + + " - " + + df["result_group-upper_ci"].astype(str) + + ")" ) + df["key"] = df["result_group-id"] + df["statistical significance"] = df.apply( + lambda x: x["result_group-p_value_qualifier"] + if pd.isna(x["result_group-p_value"]) + else f"{x['result_group-p_value']:g}" + if x["result_group-p_value_qualifier"] in ["=", "-", "n.s."] + else 
f"{x['result_group-p_value_qualifier']}{x['result_group-p_value']:g}", + axis="columns", + ) + df = df.drop(columns="result_group-p_value_qualifier") - for rg in res["results"]: - row_copy = list(row) - row_copy.extend( - [ - rg["group"]["group_id"], - rg["group"]["name"], - rg["group"]["comparative_name"], - rg["group"]["numeric"], - f'{ser["study_population"]["study"]["short_citation"]} ({rg["group"]["name"]}, n={rg["n"]})', - f'{rg["estimate"]} ({rg["lower_ci"]} - {rg["upper_ci"]})', - rg["id"], - rg["id"], # repeat for data-pivot key - rg["n"], - rg["estimate"], - rg["lower_ci"], - rg["upper_ci"], - rg["lower_range"], - rg["upper_range"], - rg["lower_bound_interval"], - rg["upper_bound_interval"], - rg["variance"], - rg["p_value_text"], - rg["p_value"], - rg["is_main_finding"], - rg["main_finding_support"], - rg["percentControlMean"], - rg["percentControlLow"], - rg["percentControlHigh"], - ] - ) - rows.append(row_copy) + df = self._add_percent_control(df) + + df = df.rename( + columns={ + "study-id": "study id", + "study-short_citation": "study name", + "study-study_identifier": "study identifier", + "study-published": "study published", + "sp-id": "study population id", + "sp-name": "study population name", + "sp-age_profile": "study population age profile", + "sp-source": "study population source", + "sp-design": "design", + "outcome-id": "outcome id", + "outcome-name": "outcome name", + "outcome-system": "outcome system", + "outcome-effect": "outcome effect", + "outcome-effect_subtype": "outcome effect subtype", + "outcome-diagnostic": "diagnostic", + "outcome-age_of_measurement": "age of outcome measurement", + "outcome-effects": "tags", + } + ) + df = df.rename( + columns={ + "cs-id": "comparison set id", + "cs-name": "comparison set name", + "exposure-id": "exposure id", + "exposure-name": "exposure name", + "exposure-metric": "exposure metric", + "exposure-measured": "exposure measured", + "exposure-metric_units_name": "dose units", + 
"exposure-age_of_exposure": "age of exposure", + "ct-estimate": "exposure estimate", + "ct-estimate_type": "exposure estimate type", + "ct-variance": "exposure variance", + "ct-variance_type": "exposure variance type", + "ct-lower_bound_interval": "exposure lower bound interval", + "ct-upper_bound_interval": "exposure upper bound interval", + "ct-lower_ci": "exposure lower ci", + "ct-upper_ci": "exposure upper ci", + "ct-lower_range": "exposure lower range", + "ct-upper_range": "exposure upper range", + "result-id": "result id", + "result-name": "result name", + "result-population_description": "result population description", + "result-tags": "result tags", + "metric-name": "statistical metric", + "metric-abbreviation": "statistical metric abbreviation", + "result-metric_description": "statistical metric description", + "result-comments": "result summary", + "result-dose_response": "dose response", + "result-statistical_power": "statistical power", + "result-statistical_test_results": "statistical test results", + "result-ci_units": "CI units", + "group-group_id": "exposure group order", + "group-name": "exposure group name", + "group-comparative_name": "exposure group comparison name", + "group-numeric": "exposure group numeric", + "result_group-id": "result group id", + "result_group-n": "N", + "result_group-estimate": "estimate", + "result_group-lower_ci": "lower CI", + "result_group-upper_ci": "upper CI", + "result_group-lower_range": "lower range", + "result_group-upper_range": "upper range", + "result_group-lower_bound_interval": "lower bound interval", + "result_group-upper_bound_interval": "upper bound interval", + "result_group-variance": "variance", + "result_group-p_value": "statistical significance (numeric)", + "result_group-is_main_finding": "main finding", + "result_group-main_finding_support": "main finding support", + } + ) + + return df diff --git a/hawc/apps/epi/models.py b/hawc/apps/epi/models.py index b7eab9f6ac..e73b0549b0 100644 --- 
a/hawc/apps/epi/models.py +++ b/hawc/apps/epi/models.py @@ -206,57 +206,6 @@ class StudyPopulation(models.Model): BREADCRUMB_PARENT = "study" - @staticmethod - def flat_complete_header_row(): - return ( - "sp-id", - "sp-url", - "sp-name", - "sp-design", - "sp-age_profile", - "sp-source", - "sp-countries", - "sp-region", - "sp-state", - "sp-eligible_n", - "sp-invited_n", - "sp-participant_n", - "sp-inclusion_criteria", - "sp-exclusion_criteria", - "sp-confounding_criteria", - "sp-comments", - "sp-created", - "sp-last_updated", - ) - - @staticmethod - def flat_complete_data_row(ser): - def getCriteriaList(lst, filt): - return "|".join( - [d["description"] for d in [d for d in lst if d["criteria_type"] == filt]] - ) - - return ( - ser["id"], - ser["url"], - ser["name"], - ser["design"], - ser["age_profile"], - ser["source"], - "|".join([c["name"] for c in ser["countries"]]), - ser["region"], - ser["state"], - ser["eligible_n"], - ser["invited_n"], - ser["participant_n"], - getCriteriaList(ser["criteria"], "Inclusion"), - getCriteriaList(ser["criteria"], "Exclusion"), - getCriteriaList(ser["criteria"], "Confounding"), - ser["comments"], - ser["created"], - ser["last_updated"], - ) - class Meta: ordering = ("name",) @@ -379,44 +328,6 @@ def get_absolute_url(self): def can_create_sets(self): return not self.study_population.can_create_sets() - @staticmethod - def flat_complete_header_row(): - return ( - "outcome-id", - "outcome-url", - "outcome-name", - "outcome-effects", - "outcome-system", - "outcome-effect", - "outcome-effect_subtype", - "outcome-diagnostic", - "outcome-diagnostic_description", - "outcome-age_of_measurement", - "outcome-outcome_n", - "outcome-summary", - "outcome-created", - "outcome-last_updated", - ) - - @staticmethod - def flat_complete_data_row(ser): - return ( - ser["id"], - ser["url"], - ser["name"], - "|".join([str(d["name"]) for d in ser["effects"]]), - ser["system"], - ser["effect"], - ser["effect_subtype"], - ser["diagnostic"], - 
ser["diagnostic_description"], - ser["age_of_measurement"], - ser["outcome_n"], - ser["summary"], - ser["created"], - ser["last_updated"], - ) - def get_study(self): return self.study_population.get_study() @@ -486,28 +397,6 @@ def get_assessment(self): def __str__(self): return self.name - @staticmethod - def flat_complete_header_row(): - return ( - "cs-id", - "cs-url", - "cs-name", - "cs-description", - "cs-created", - "cs-last_updated", - ) - - @staticmethod - def flat_complete_data_row(ser): - return ( - ser["id"], - ser["url"], - ser["name"], - ser["description"], - ser["created"], - ser["last_updated"], - ) - def get_study(self): if self.study_population: return self.study_population.get_study() @@ -590,44 +479,6 @@ def get_assessment(self): def __str__(self): return self.name - @staticmethod - def flat_complete_header_row(): - return ( - "group-id", - "group-group_id", - "group-name", - "group-numeric", - "group-comparative_name", - "group-sex", - "group-ethnicities", - "group-eligible_n", - "group-invited_n", - "group-participant_n", - "group-isControl", - "group-comments", - "group-created", - "group-last_updated", - ) - - @staticmethod - def flat_complete_data_row(ser): - return ( - ser["id"], - ser["group_id"], - ser["name"], - ser["numeric"], - ser["comparative_name"], - ser["sex"], - "|".join([d["name"] for d in ser["ethnicities"]]), - ser["eligible_n"], - ser["invited_n"], - ser["participant_n"], - ser["isControl"], - ser["comments"], - ser["created"], - ser["last_updated"], - ) - class Exposure(models.Model): objects = managers.ExposureManager() @@ -771,65 +622,6 @@ def get_absolute_url(self): def delete_caches(cls, ids): SerializerHelper.delete_caches(cls, ids) - @staticmethod - def flat_complete_header_row(): - return ( - "exposure-id", - "exposure-url", - "exposure-name", - "exposure-inhalation", - "exposure-dermal", - "exposure-oral", - "exposure-in_utero", - "exposure-iv", - "exposure-unknown_route", - "exposure-measured", - "exposure-metric", - 
"exposure-metric_units_id", - "exposure-metric_units_name", - "exposure-metric_description", - "exposure-analytical_method", - "exposure-sampling_period", - "exposure-age_of_exposure", - "exposure-duration", - "exposure-n", - "exposure-exposure_distribution", - "exposure-description", - "exposure-created", - "exposure-last_updated", - ) - - @staticmethod - def flat_complete_data_row(ser): - if ser is None: - ser = {} - units = ser.get("metric_units", {}) - return ( - ser.get("id"), - ser.get("url"), - ser.get("name"), - ser.get("inhalation"), - ser.get("dermal"), - ser.get("oral"), - ser.get("in_utero"), - ser.get("iv"), - ser.get("unknown_route"), - ser.get("measured"), - ser.get("metric"), - units.get("id"), - units.get("name"), - ser.get("metric_description"), - ser.get("analytical_method"), - ser.get("sampling_period"), - ser.get("age_of_exposure"), - ser.get("duration"), - ser.get("n"), - ser.get("exposure_distribution"), - ser.get("description"), - ser.get("created"), - ser.get("last_updated"), - ) - def get_study(self): return self.study_population.get_study() @@ -891,42 +683,6 @@ class Meta: def __str__(self): return f"{{CT id={self.id}, exposure={self.exposure}}}" - @staticmethod - def flat_complete_header_row(): - return ( - "central_tendency-id", - "central_tendency-estimate", - "central_tendency-estimate_type", - "central_tendency-variance", - "central_tendency-variance_type", - "central_tendency-lower_ci", - "central_tendency-upper_ci", - "central_tendency-lower_range", - "central_tendency-upper_range", - "central_tendency-description", - "central_tendency-lower_bound_interval", - "central_tendency-upper_bound_interval", - ) - - @staticmethod - def flat_complete_data_row(ser): - if ser is None: - ser = {} - return ( - ser.get("id"), - ser.get("estimate"), - ser.get("estimate_type"), - ser.get("variance"), - ser.get("variance_type"), - ser.get("lower_ci"), - ser.get("upper_ci"), - ser.get("lower_range"), - ser.get("upper_range"), - 
ser.get("description"), - ser.get("lower_bound_interval"), - ser.get("upper_bound_interval"), - ) - class GroupNumericalDescriptions(models.Model): objects = managers.GroupNumericalDescriptionsManager() @@ -1131,72 +887,6 @@ def get_assessment(self): def get_absolute_url(self): return reverse("epi:result_detail", args=(self.pk,)) - @staticmethod - def flat_complete_header_row(): - return ( - "metric-id", - "metric-name", - "metric-abbreviation", - "result-id", - "result-name", - "result-metric_description", - "result-metric_units", - "result-data_location", - "result-population_description", - "result-dose_response", - "result-dose_response_details", - "result-prevalence_incidence", - "result-statistical_power", - "result-statistical_power_details", - "result-statistical_test_results", - "result-trend_test", - "result-adjustment_factors", - "result-adjustment_factors_considered", - "result-estimate_type", - "result-variance_type", - "result-ci_units", - "result-comments", - "result-created", - "result-last_updated", - ) - - @staticmethod - def flat_complete_data_row(ser): - def getFactorList(lst, isIncluded): - return "|".join( - [ - d["description"] - for d in [d for d in lst if d["included_in_final_model"] == isIncluded] - ] - ) - - return ( - ser["metric"]["id"], - ser["metric"]["metric"], - ser["metric"]["abbreviation"], - ser["id"], - ser["name"], - ser["metric_description"], - ser["metric_units"], - ser["data_location"], - ser["population_description"], - ser["dose_response"], - ser["dose_response_details"], - ser["prevalence_incidence"], - ser["statistical_power"], - ser["statistical_power_details"], - ser["statistical_test_results"], - ser["trend_test"], - getFactorList(ser["factors"], True), - getFactorList(ser["factors"], False), - ser["estimate_type"], - ser["variance_type"], - ser["ci_units"], - ser["comments"], - ser["created"], - ser["last_updated"], - ) - def get_study(self): return self.outcome.get_study() @@ -1426,48 +1116,6 @@ def 
lower_bound_interval(self): def upper_bound_interval(self): return self.upper_range if self.upper_ci is None else self.upper_ci - @staticmethod - def flat_complete_header_row(): - return ( - "result_group-id", - "result_group-n", - "result_group-estimate", - "result_group-variance", - "result_group-lower_ci", - "result_group-upper_ci", - "result_group-lower_range", - "result_group-upper_range", - "result_group-lower_bound_interval", - "result_group-upper_bound_interval", - "result_group-p_value_qualifier", - "result_group-p_value", - "result_group-is_main_finding", - "result_group-main_finding_support", - "result_group-created", - "result_group-last_updated", - ) - - @staticmethod - def flat_complete_data_row(ser): - return ( - ser["id"], - ser["n"], - ser["estimate"], - ser["variance"], - ser["lower_ci"], - ser["upper_ci"], - ser["lower_range"], - ser["upper_range"], - ser["lower_bound_interval"], - ser["upper_bound_interval"], - ser["p_value_qualifier_display"], - ser["p_value"], - ser["is_main_finding"], - ser["main_finding_support"], - ser["created"], - ser["last_updated"], - ) - @staticmethod def stdev(variance_type, variance, n): # calculate stdev given re diff --git a/hawc/apps/invitro/api.py b/hawc/apps/invitro/api.py index 8689d2084a..1108faebb0 100644 --- a/hawc/apps/invitro/api.py +++ b/hawc/apps/invitro/api.py @@ -44,6 +44,47 @@ def full_export(self, request, pk): ) return Response(exporter.build_export()) + @action( + detail=True, + url_path="full-export2", + action_perms=AssessmentViewSetPermissions.CAN_VIEW_OBJECT, + renderer_classes=PandasRenderers, + ) + def full_export2(self, request, pk): + self.get_object() + self.object_list = self.get_endpoint_queryset() + exporter = exports.DataPivotEndpoint2( + self.object_list, filename=f"{self.assessment}-invitro" + ) + return Response(exporter.build_export()) + + @action( + detail=True, + url_path="group-export", + action_perms=AssessmentViewSetPermissions.CAN_VIEW_OBJECT, + 
renderer_classes=PandasRenderers, + ) + def group_export(self, request, pk): + self.get_object() + self.object_list = self.get_endpoint_queryset() + exporter = exports.DataPivotEndpointGroup( + self.object_list, filename=f"{self.assessment}-invitro" + ) + return Response(exporter.build_export()) + + @action( + detail=True, + url_path="group-export2", + action_perms=AssessmentViewSetPermissions.CAN_VIEW_OBJECT, + renderer_classes=PandasRenderers, + ) + def group_export2(self, request, pk): + self.get_object() + self.object_list = self.get_endpoint_queryset() + exporter = exports.DataPivotEndpointGroup2( + self.object_list, filename=f"{self.assessment}-invitro" + ) + return Response(exporter.build_export()) class IVChemical(AssessmentViewSet): assessment_filter_args = "study__assessment" diff --git a/hawc/apps/invitro/exports.py b/hawc/apps/invitro/exports.py index e05004de2b..9222829f80 100644 --- a/hawc/apps/invitro/exports.py +++ b/hawc/apps/invitro/exports.py @@ -1,10 +1,496 @@ from copy import copy +import math +import pandas as pd from django.apps import apps +from django.conf import settings + +from django.db.models import Exists, OuterRef from ..common.helper import FlatFileExporter from ..materialized.models import FinalRiskOfBiasScore from ..study.models import Study +from ..common.exports import Exporter, ModelExport +from ..common.helper import FlatFileExporter +from ..common.models import sql_display, sql_format, str_m2m +from ..materialized.models import FinalRiskOfBiasScore +from ..study.exports import StudyExport +from . 
def percent_control(n_1, mu_1, sd_1, n_2, mu_2, sd_2, z=1.96):
    """Express a treated-group response as a percent change from control.

    Args:
        n_1: Sample size of the control group.
        mu_1: Mean response of the control group.
        sd_1: Standard deviation of the control group.
        n_2: Sample size of the treated group.
        mu_2: Mean response of the treated group.
        sd_2: Standard deviation of the treated group.
        z: Multiplier applied to the standard error to build the interval.
            Defaults to 1.96, the value that was previously hard-coded.

    Returns:
        A ``(mean, low, high)`` tuple. ``mean`` is ``None`` unless both means
        are present and strictly positive. ``low`` and ``high`` are ``None``
        unless ``mean`` was computed and all of ``sd_1``, ``sd_2``, ``n_1``
        and ``n_2`` are truthy.
    """
    mean = low = high = None

    # Only strictly positive means are compared; zero or negative responses
    # yield no percent-control value at all.
    if mu_1 is not None and mu_2 is not None and mu_1 > 0 and mu_2 > 0:
        mean = (mu_2 - mu_1) / mu_1 * 100.0
        if sd_1 and sd_2 and n_1 and n_2:
            # Standard error of the ratio mu_2 / mu_1.
            sd = math.sqrt(
                pow(mu_1, -2)
                * ((pow(sd_2, 2) / n_2) + (pow(mu_2, 2) * pow(sd_1, 2)) / (n_1 * pow(mu_1, 2)))
            )
            ci = (z * sd) * 100
            # Sorting keeps low <= high even if a negative multiplier is passed.
            low, high = sorted([mean - ci, mean + ci])

    return mean, low, high
class DSSToxExport(ModelExport):
    """Export columns for the DSSTox substance attached to a chemical."""

    def get_value_map(self):
        # Every exported key reads a field or annotation of the same name.
        fields = ("dtxsid", "dashboard_url", "img_url", "content", "created", "last_updated")
        return {field: field for field in fields}

    def get_annotation_map(self, query_prefix):
        dtxsid_lookup = query_prefix + "dtxsid"
        # The keyed CCTE endpoint is used only when an API key is configured.
        if settings.CCTE_API_KEY:
            image_template = f"https://api-ccte.epa.gov/chemical/file/image/search/by-dtxsid/{{}}?x-api-key={settings.CCTE_API_KEY}"
        else:
            image_template = "https://comptox.epa.gov/dashboard-api/ccdapp1/chemical-files/image/by-dtxsid/{}"
        return {
            "dashboard_url": sql_format(
                "https://comptox.epa.gov/dashboard/dsstoxdb/results?search={}", dtxsid_lookup
            ),
            "img_url": sql_format(image_template, dtxsid_lookup),
        }


class IVChemicalExport(ModelExport):
    """Export columns for an in vitro chemical."""

    def get_value_map(self):
        return {field: field for field in ("id", "name", "cas", "purity")}


class IVExperimentExport(ModelExport):
    """Export columns for an in vitro experiment."""

    def get_value_map(self):
        value_map = {
            "id": "id",
            "dose_units": "dose_units__name",
            "metabolic_activation": "metabolic_activation_display",
            "transfection": "transfection",
        }
        return value_map

    def get_annotation_map(self, query_prefix):
        activation = sql_display(
            query_prefix + "metabolic_activation", constants.MetabolicActivation
        )
        return {"metabolic_activation_display": activation}


class IVCellTypeExport(ModelExport):
    """Export columns for the cell type used by an experiment."""

    def get_value_map(self):
        value_map = {
            "id": "id",
            "species": "species",
            "strain": "strain",
            "sex": "sex_display",
            "cell_type": "cell_type",
            "tissue": "tissue",
        }
        return value_map

    def get_annotation_map(self, query_prefix):
        return {"sex_display": sql_display(query_prefix + "sex", constants.Sex)}


class IVEndpointExport(ModelExport):
    """Export columns for an in vitro endpoint."""

    def get_value_map(self):
        value_map = {
            "id": "id",
            "name": "name",
            "data_type": "data_type",
            "variance_type": "variance_type",
            "effects": "effects__name",
            "assay_type": "assay_type",
            "short_description": "short_description",
            "response_units": "response_units",
            "observation_time": "observation_time",
            "observation_time_units": "observation_time_units_display",
            "NOEL": "NOEL",
            "LOEL": "LOEL",
            "monotonicity": "monotonicity_display",
            "overall_pattern": "overall_pattern_display",
            "trend_test": "trend_test_display",
        }
        return value_map

    def get_annotation_map(self, query_prefix):
        annotations = {"effects__name": str_m2m(query_prefix + "effects__name")}
        # Choice fields are exported through a "<field>_display" annotation.
        display_choices = (
            ("observation_time_units", constants.ObservationTimeUnits),
            ("monotonicity", constants.Monotonicity),
            ("overall_pattern", constants.OverallPattern),
            ("trend_test", constants.TrendTestResult),
        )
        for field, choices in display_choices:
            annotations[f"{field}_display"] = sql_display(query_prefix + field, choices)
        return annotations


class IVEndpointGroupExport(ModelExport):
    """Export columns for a single dose group of an endpoint."""

    def get_value_map(self):
        value_map = {
            "id": "id",
            "dose_group_id": "dose_group_id",
            "dose": "dose",
            "n": "n",
            "response": "response",
            "variance": "variance",
            "difference_control": "difference_control",
            "difference_control_display": "difference_control_display",
            "significant_control": "significant_control_display",
            "cytotoxicity_observed": "cytotoxicity_observed_display",
            "precipitation_observed": "precipitation_observed_display",
        }
        return value_map

    def get_annotation_map(self, query_prefix):
        # OBSERVATION_CHOICES is wrapped in an ad-hoc class exposing `.choices`;
        # presumably sql_display only reads that attribute - TODO confirm.
        Observation = type("Observation", (object,), {"choices": constants.OBSERVATION_CHOICES})
        display_choices = (
            ("difference_control", constants.DifferenceControl),
            ("significant_control", constants.Significance),
            ("cytotoxicity_observed", Observation),
            ("precipitation_observed", Observation),
        )
        return {
            f"{field}_display": sql_display(query_prefix + field, choices)
            for field, choices in display_choices
        }


class IVBenchmarkExport(ModelExport):
    """Export columns for an endpoint benchmark."""

    def get_value_map(self):
        return {field: field for field in ("id", "benchmark", "value")}


class InvitroExporter(Exporter):
    """Exporter whose modules feed the endpoint-level in vitro data pivot."""

    def build_modules(self) -> list[ModelExport]:
        study = StudyExport(
            "study",
            "experiment__study",
            include=(
                "id",
                "hero_id",
                "pubmed_id",
                "doi",
                "short_citation",
                "study_identifier",
                "published",
            ),
        )
        chemical = IVChemicalExport("iv_chemical", "chemical")
        dsstox = DSSToxExport("dsstox", "chemical__dtxsid")
        experiment = IVExperimentExport("iv_experiment", "experiment")
        cell_type = IVCellTypeExport("iv_cell_type", "experiment__cell_type")
        # The endpoint is the root of the queryset, hence the empty query prefix.
        endpoint = IVEndpointExport("iv_endpoint", "", exclude=("data_type", "variance_type"))
        endpoint_group = IVEndpointGroupExport(
            "iv_endpoint_group",
            "groups",
            include=(
                "id",
                "dose",
                "difference_control_display",
                "significant_control",
                "cytotoxicity_observed",
            ),
        )
        benchmark = IVBenchmarkExport("iv_benchmark", "benchmarks")
        return [
            study,
            chemical,
            dsstox,
            experiment,
            cell_type,
            endpoint,
            endpoint_group,
            benchmark,
        ]
class DataPivotEndpoint2(FlatFileExporter):
    """Endpoint-level in vitro data pivot export built on ``InvitroExporter``.

    Dose groups and benchmarks arrive as extra rows per endpoint and are
    folded into numbered columns, so the group and benchmark id columns do
    not appear in the result.
    """

    # TODO add category, otherwise done

    # NOEL/LOEL value that is exported as None rather than resolved to a dose.
    _NO_EFFECT_LEVEL = -999
    _DSSTOX_PREFIX = "dsstox-"
    _ENDPOINT_ID = "iv_endpoint-id"
    _GROUP_ID = "iv_endpoint_group-id"
    _GROUP_DOSE = "iv_endpoint_group-dose"
    _GROUP_CHANGE = "iv_endpoint_group-difference_control_display"
    _GROUP_SIGNIFICANT = "iv_endpoint_group-significant_control"
    _GROUP_CYTOTOXICITY = "iv_endpoint_group-cytotoxicity_observed"
    _BENCHMARK_ID = "iv_benchmark-id"
    _BENCHMARK_TYPE = "iv_benchmark-benchmark"
    _BENCHMARK_VALUE = "iv_benchmark-value"

    @staticmethod
    def _drop_child_duplicates(group_df: pd.DataFrame, child_id: str) -> pd.DataFrame:
        """Deduplicate rows on every ``*-id`` column except ``child_id``."""
        id_columns = group_df.columns[group_df.columns.str.endswith("-id")]
        return group_df.drop_duplicates(subset=id_columns.difference([child_id]))

    @classmethod
    def _effect_level_dose(cls, unique_df: pd.DataFrame, column: str):
        """Resolve a NOEL/LOEL dose-group index to its dose (None if unset)."""
        index = unique_df.iloc[0][column]
        if index == cls._NO_EFFECT_LEVEL:
            return None
        return unique_df.iloc[index][cls._GROUP_DOSE]

    def handle_dsstox(self, df: pd.DataFrame) -> pd.DataFrame:
        """Condense all ``dsstox-*`` columns into one ``chemical DTXSID`` column.

        Each row receives a dict of its DSSTox values keyed by the column
        name with the ``dsstox-`` prefix removed.
        """
        dsstox_cols = [col for col in df.columns if col.startswith(self._DSSTOX_PREFIX)]
        dsstox_df = df[dsstox_cols].rename(
            columns=lambda col: col.removeprefix(self._DSSTOX_PREFIX)
        )
        df["chemical DTXSID"] = dsstox_df.to_dict(orient="records")
        return df.drop(columns=dsstox_cols)

    def handle_dose_groups(self, df: pd.DataFrame) -> pd.DataFrame:
        """Fold per-dose-group rows into numbered per-endpoint columns."""
        per_dose_columns = [
            self._GROUP_DOSE,
            self._GROUP_CHANGE,
            self._GROUP_SIGNIFICANT,
            self._GROUP_CYTOTOXICITY,
        ]

        def _func(group_df: pd.DataFrame) -> pd.DataFrame:
            # work on a copy so the frame handed in by groupby is never mutated
            group_df = group_df.copy()
            # handle case with no dose groups
            if group_df[self._GROUP_ID].isna().all():
                group_df["number of doses"] = 0
                group_df["minimum dose"] = None
                group_df["maximum dose"] = None
                group_df["iv_endpoint-NOEL"] = None
                group_df["iv_endpoint-LOEL"] = None
                return group_df
            # only interested in unique, non-control dose groups
            unique_df = group_df.drop_duplicates(subset=self._GROUP_ID)
            non_control_df = unique_df.loc[unique_df[self._GROUP_DOSE] > 0]
            # add dose related columns
            group_df["number of doses"] = non_control_df.shape[0]
            group_df["minimum dose"] = non_control_df[self._GROUP_DOSE].min()
            group_df["maximum dose"] = non_control_df[self._GROUP_DOSE].max()
            # NOEL/LOEL are positions within all unique groups, control included
            group_df["iv_endpoint-NOEL"] = self._effect_level_dose(unique_df, "iv_endpoint-NOEL")
            group_df["iv_endpoint-LOEL"] = self._effect_level_dose(unique_df, "iv_endpoint-LOEL")
            rows = non_control_df[per_dose_columns].itertuples(index=False, name=None)
            for i, (dose, change, significant, cytotoxicity) in enumerate(rows, start=1):
                group_df[f"Dose {i}"] = dose
                group_df[f"Change Control {i}"] = change
                group_df[f"Significant {i}"] = significant
                group_df[f"Cytotoxicity {i}"] = cytotoxicity
            # return a df that is dose group agnostic
            return self._drop_child_duplicates(group_df, self._GROUP_ID)

        return (
            df.groupby(self._ENDPOINT_ID, group_keys=False)
            .apply(_func)
            .drop(columns=[self._GROUP_ID, *per_dose_columns])
        )

    def handle_benchmarks(self, df: pd.DataFrame) -> pd.DataFrame:
        """Fold per-benchmark rows into numbered per-endpoint columns."""

        def _func(group_df: pd.DataFrame) -> pd.DataFrame:
            # handle case with no benchmarks
            if group_df[self._BENCHMARK_ID].isna().all():
                # no need to deduplicate, since there should be
                # only one benchmark id: None
                return group_df
            group_df = group_df.copy()
            # only interested in unique benchmarks
            unique_df = group_df.drop_duplicates(subset=self._BENCHMARK_ID)
            rows = unique_df[[self._BENCHMARK_TYPE, self._BENCHMARK_VALUE]].itertuples(
                index=False, name=None
            )
            # add the benchmark columns
            for i, (benchmark_type, benchmark_value) in enumerate(rows, start=1):
                group_df[f"Benchmark Type {i}"] = benchmark_type
                group_df[f"Benchmark Value {i}"] = benchmark_value
            # return a df that is benchmark agnostic
            return self._drop_child_duplicates(group_df, self._BENCHMARK_ID)

        return (
            df.groupby(self._ENDPOINT_ID, group_keys=False)
            .apply(_func)
            .drop(columns=[self._BENCHMARK_ID, self._BENCHMARK_TYPE, self._BENCHMARK_VALUE])
        )

    def build_df(self) -> pd.DataFrame:
        """Build the export data frame.

        Returns:
            The renamed export frame, or an empty frame when the queryset has
            no endpoints (previously this raised ``AttributeError`` because
            ``queryset.first()`` returned ``None``).
        """
        first_endpoint = self.queryset.first()
        if first_endpoint is None:
            return pd.DataFrame()

        qs = (
            self.queryset.select_related(
                "experiment__study", "chemical__dtxsid", "experiment__cell_type"
            )
            .prefetch_related("groups", "benchmarks")
            .order_by("id", "groups", "benchmarks")
        )
        df = InvitroExporter().get_df(qs)

        # join the final risk of bias scores, one row per study
        study_ids = list(df["study-id"].unique())
        rob_headers, rob_data = FinalRiskOfBiasScore.get_dp_export(
            first_endpoint.assessment_id,
            study_ids,
            "invitro",
        )
        rob_df = pd.DataFrame(
            data=[
                [rob_data[(study_id, metric_id)] for metric_id in rob_headers]
                for study_id in study_ids
            ],
            columns=list(rob_headers.values()),
            index=study_ids,
        )
        df = df.join(rob_df, on="study-id")

        df["key"] = df[self._ENDPOINT_ID]

        df = self.handle_dose_groups(df)
        df = self.handle_benchmarks(df)
        df = self.handle_dsstox(df)

        return df.rename(
            columns={
                "study-id": "study id",
                "study-hero_id": "study hero_id",
                "study-pubmed_id": "study pubmed_id",
                "study-doi": "study doi",
                "study-short_citation": "study name",
                "study-study_identifier": "study identifier",
                "study-published": "study published",
                "iv_chemical-id": "chemical id",
                "iv_chemical-name": "chemical name",
                "iv_chemical-cas": "chemical CAS",
                "iv_chemical-purity": "chemical purity",
                "iv_experiment-id": "IVExperiment id",
                "iv_experiment-dose_units": "Dose units",
                "iv_experiment-metabolic_activation": "Metabolic activation",
                "iv_experiment-transfection": "Transfection",
                "iv_cell_type-id": "IVCellType id",
                "iv_cell_type-species": "cell species",
                "iv_cell_type-strain": "cell strain",
                "iv_cell_type-sex": "cell sex",
                "iv_cell_type-cell_type": "cell type",
                "iv_cell_type-tissue": "cell tissue",
                "iv_endpoint-id": "IVEndpoint id",
                "iv_endpoint-name": "IVEndpoint name",
                "iv_endpoint-effects": "IVEndpoint description tags",
                "iv_endpoint-assay_type": "assay type",
                "iv_endpoint-short_description": "endpoint description",
                "iv_endpoint-response_units": "endpoint response units",
                "iv_endpoint-observation_time": "observation time",
                "iv_endpoint-observation_time_units": "observation time units",
                "iv_endpoint-NOEL": "NOEL",
                "iv_endpoint-LOEL": "LOEL",
                "iv_endpoint-monotonicity": "monotonicity",
                "iv_endpoint-overall_pattern": "overall pattern",
                "iv_endpoint-trend_test": "trend test result",
            }
        )


class InvitroGroupExporter(Exporter):
    """Exporter whose modules feed the dose-group-level in vitro data pivot."""

    def build_modules(self) -> list[ModelExport]:
        return [
            StudyExport(
                "study",
                "experiment__study",
                include=("id", "short_citation", "study_identifier", "published"),
            ),
            IVChemicalExport("iv_chemical", "chemical"),
            DSSToxExport("dsstox", "chemical__dtxsid"),
            IVExperimentExport("iv_experiment", "experiment"),
            IVCellTypeExport("iv_cell_type", "experiment__cell_type"),
            # the endpoint is the root of the queryset, hence the empty query prefix
            IVEndpointExport("iv_endpoint", ""),
            IVEndpointGroupExport(
                "iv_endpoint_group",
                "groups",
                exclude=("difference_control_display",),
            ),
        ]
class DataPivotEndpointGroup2(FlatFileExporter):
    """Dose-group-level in vitro data pivot export built on ``InvitroGroupExporter``."""

    # NOEL/LOEL value that is exported as None rather than resolved to a dose.
    _NO_EFFECT_LEVEL = -999
    _DSSTOX_PREFIX = "dsstox-"
    _GROUP_DOSE = "iv_endpoint_group-dose"

    def collapse_dsstox(self, df: pd.DataFrame) -> pd.DataFrame:
        """Condense all ``dsstox-*`` columns into one ``chemical DTXSID`` column.

        Each row receives a dict of its DSSTox values keyed by the column
        name with the ``dsstox-`` prefix removed.
        """
        dsstox_cols = [col for col in df.columns if col.startswith(self._DSSTOX_PREFIX)]
        dsstox_df = df[dsstox_cols].rename(
            columns=lambda col: col.removeprefix(self._DSSTOX_PREFIX)
        )
        df["chemical DTXSID"] = dsstox_df.to_dict(orient="records")
        return df.drop(columns=dsstox_cols)

    def add_stdevs(self, df: pd.DataFrame) -> pd.DataFrame:
        """Add a ``stdev`` column and drop the variance inputs it was derived from."""
        df["stdev"] = df.apply(
            lambda row: models.IVEndpointGroup.stdev(
                row["iv_endpoint-variance_type"],
                row["iv_endpoint_group-variance"],
                row["iv_endpoint_group-n"],
            ),
            axis="columns",
        )
        return df.drop(columns=["iv_endpoint-variance_type", "iv_endpoint_group-variance"])

    def _add_percent_control(self, df: pd.DataFrame) -> pd.DataFrame:
        """Add dose range, NOEL/LOEL doses and percent-control columns per endpoint.

        The first row of each endpoint is used as the control group.
        """

        def _effect_level_dose(groups: pd.DataFrame, index):
            # NOEL/LOEL are stored as a position within the endpoint's rows
            if index == self._NO_EFFECT_LEVEL:
                return None
            return groups.iloc[index][self._GROUP_DOSE]

        def _apply_results(_df1: pd.DataFrame) -> pd.DataFrame:
            # work on a copy so the frame handed in by groupby is never mutated
            _df1 = _df1.copy()
            control = _df1.iloc[0]

            positive_doses = _df1[self._GROUP_DOSE].loc[lambda x: x > 0]
            _df1["low_dose"] = positive_doses.min()
            _df1["high_dose"] = positive_doses.max()

            _df1["iv_endpoint-NOEL"] = _effect_level_dose(_df1, control["iv_endpoint-NOEL"])
            _df1["iv_endpoint-LOEL"] = _effect_level_dose(_df1, control["iv_endpoint-LOEL"])

            data_type = control["iv_endpoint-data_type"]
            n_1 = control["iv_endpoint_group-n"]
            mu_1 = control["iv_endpoint_group-response"]
            sd_1 = control["stdev"]

            def _apply_result_groups(test: pd.Series) -> pd.Series:
                # Only continuous data is handled; in vitro groups carry no
                # incidence values, so nothing is computed for dichotomous data.
                if data_type == constants.DataType.CONTINUOUS:
                    (
                        test["percent control mean"],
                        test["percent control low"],
                        test["percent control high"],
                    ) = percent_control(
                        n_1,
                        mu_1,
                        sd_1,
                        test["iv_endpoint_group-n"],
                        test["iv_endpoint_group-response"],
                        test["stdev"],
                    )
                return test

            return _df1.apply(_apply_result_groups, axis="columns")

        computed_df = df.groupby("iv_endpoint-id", group_keys=False).apply(_apply_results)
        return computed_df.drop(columns="iv_endpoint-data_type")

    def build_df(self) -> pd.DataFrame:
        """Build the export data frame.

        Only endpoints that have at least one dose group are exported.

        Returns:
            The renamed export frame, or an empty frame when the queryset has
            no endpoints (previously this raised ``AttributeError`` because
            ``queryset.first()`` returned ``None``).
        """
        first_endpoint = self.queryset.first()
        if first_endpoint is None:
            return pd.DataFrame()

        has_groups = Exists(models.IVEndpointGroup.objects.filter(endpoint=OuterRef("pk")))
        qs = (
            self.queryset.select_related(
                "experiment__study", "chemical__dtxsid", "experiment__cell_type"
            )
            .prefetch_related("groups")
            .filter(has_groups)
            .order_by("id", "groups")
        )
        df = InvitroGroupExporter().get_df(qs)

        # join the final risk of bias scores, one row per study
        study_ids = list(df["study-id"].unique())
        rob_headers, rob_data = FinalRiskOfBiasScore.get_dp_export(
            first_endpoint.assessment_id,
            study_ids,
            "invitro",
        )
        rob_df = pd.DataFrame(
            data=[
                [rob_data[(study_id, metric_id)] for metric_id in rob_headers]
                for study_id in study_ids
            ],
            columns=list(rob_headers.values()),
            index=study_ids,
        )
        df = df.join(rob_df, on="study-id")

        df = self.add_stdevs(df)
        df = self._add_percent_control(df)
        df = self.collapse_dsstox(df)
        df["iv_endpoint_group-difference_control"] = df[
            "iv_endpoint_group-difference_control"
        ].map(models.IVEndpointGroup.DIFFERENCE_CONTROL_SYMBOLS)

        df["key"] = df["iv_endpoint_group-id"]
        df = df.drop(columns=["iv_endpoint_group-id"])

        return df.rename(
            columns={
                "study-id": "study id",
                "study-short_citation": "study name",
                "study-study_identifier": "study identifier",
                "study-published": "study published",
                "iv_chemical-id": "chemical id",
                "iv_chemical-name": "chemical name",
                "iv_chemical-cas": "chemical CAS",
                "iv_chemical-purity": "chemical purity",
                "iv_experiment-id": "IVExperiment id",
                "iv_experiment-dose_units": "dose units",
                "iv_experiment-metabolic_activation": "metabolic activation",
                "iv_experiment-transfection": "transfection",
                "iv_cell_type-id": "IVCellType id",
                "iv_cell_type-species": "cell species",
                "iv_cell_type-strain": "cell strain",
                "iv_cell_type-sex": "cell sex",
                "iv_cell_type-cell_type": "cell type",
                "iv_cell_type-tissue": "cell tissue",
                "iv_endpoint-id": "IVEndpoint id",
                "iv_endpoint-name": "IVEndpoint name",
                "iv_endpoint-effects": "IVEndpoint description tags",
                "iv_endpoint-assay_type": "assay type",
                "iv_endpoint-short_description": "endpoint description",
                "iv_endpoint-response_units": "endpoint response units",
                "iv_endpoint-observation_time": "observation time",
                "iv_endpoint-observation_time_units": "observation time units",
                "iv_endpoint-NOEL": "NOEL",
                "iv_endpoint-LOEL": "LOEL",
                "iv_endpoint-monotonicity": "monotonicity",
                "iv_endpoint-overall_pattern": "overall pattern",
                "iv_endpoint-trend_test": "trend test result",
                "iv_endpoint_group-dose_group_id": "dose index",
                "iv_endpoint_group-dose": "dose",
                "iv_endpoint_group-n": "N",
                "iv_endpoint_group-response": "response",
                "iv_endpoint_group-difference_control": "change from control",
                "iv_endpoint_group-significant_control": "significant from control",
                "iv_endpoint_group-cytotoxicity_observed": "cytotoxicity observed",
                "iv_endpoint_group-precipitation_observed": "precipitation observed",
            }
        )
"iv_endpoint-name":"IVEndpoint name", + "iv_endpoint-effects":"IVEndpoint description tags", + "iv_endpoint-assay_type":"assay type", + "iv_endpoint-short_description":"endpoint description", + "iv_endpoint-response_units":"endpoint response units", + "iv_endpoint-observation_time":"observation time", + "iv_endpoint-observation_time_units":"observation time units", + "iv_endpoint-NOEL":"NOEL", + "iv_endpoint-LOEL":"LOEL", + "iv_endpoint-monotonicity":"monotonicity", + "iv_endpoint-overall_pattern":"overall pattern", + "iv_endpoint-trend_test":"trend test result", + + } + ) + + df = df.rename( + columns={ + "iv_endpoint_group-dose_group_id":"dose index", + "iv_endpoint_group-dose":"dose", + "iv_endpoint_group-n":"N", + "iv_endpoint_group-response":"response", + "iv_endpoint_group-difference_control":"change from control", + "iv_endpoint_group-significant_control":"significant from control", + "iv_endpoint_group-cytotoxicity_observed":"cytotoxicity observed", + "iv_endpoint_group-precipitation_observed":"precipitation observed", + } + ) + return df + + + + def getDose(ser, tag): @@ -129,6 +615,7 @@ def _get_data_rows(self): doses.pop(0) diffs.pop(0) sigs.pop(0) + #cytotoxes.pop(0) number_doses = len(doses) diff --git a/hawc/apps/study/exports.py b/hawc/apps/study/exports.py index 3e6f1a4a8b..8f8cc2289e 100644 --- a/hawc/apps/study/exports.py +++ b/hawc/apps/study/exports.py @@ -3,6 +3,7 @@ from django.db.models import Q from ..common.exports import ModelExport +from ..common.helper import cleanHTML from ..common.models import sql_display, sql_format, str_m2m from ..lit.constants import ReferenceDatabase from .constants import CoiReported @@ -54,7 +55,11 @@ def get_annotation_map(self, query_prefix): def prepare_df(self, df): for key in [self.get_column_name("pubmed_id"), self.get_column_name("hero_id")]: - df[key] = pd.to_numeric(df[key], errors="coerce") + if key in df.columns: + df[key] = pd.to_numeric(df[key], errors="coerce") for key in 
[self.get_column_name("doi")]: - df[key] = df[key].replace("", np.nan) + if key in df.columns: + df[key] = df[key].replace("", np.nan) + if (key := self.get_column_name("summary")) in df.columns: + df.loc[:, key] = df[key].apply(cleanHTML) return df diff --git a/tests/data/api/api-dp-data-epi.json b/tests/data/api/api-dp-data-epi.json index 0e90856514..e30adfc574 100644 --- a/tests/data/api/api-dp-data-epi.json +++ b/tests/data/api/api-dp-data-epi.json @@ -52,7 +52,7 @@ "result name": "partial PTSD", "result population description": "", "result summary": "", - "result tags": "|tag2|", + "result tags": "tag2", "statistical metric": "other", "statistical metric abbreviation": "oth", "statistical metric description": "count", @@ -68,7 +68,7 @@ "study population name": "Tokyo subway victims", "study population source": "", "study published": true, - "tags": "|tag2|", + "tags": "tag2", "upper CI": null, "upper bound interval": null, "upper range": null, @@ -127,7 +127,7 @@ "result name": "partial PTSD", "result population description": "", "result summary": "", - "result tags": "|tag2|", + "result tags": "tag2", "statistical metric": "other", "statistical metric abbreviation": "oth", "statistical metric description": "count", @@ -143,7 +143,7 @@ "study population name": "Tokyo subway victims", "study population source": "", "study published": true, - "tags": "|tag2|", + "tags": "tag2", "upper CI": null, "upper bound interval": null, "upper range": null, @@ -202,7 +202,7 @@ "result name": "partial PTSD", "result population description": "", "result summary": "", - "result tags": "|tag2|", + "result tags": "tag2", "statistical metric": "other", "statistical metric abbreviation": "oth", "statistical metric description": "count", @@ -218,7 +218,7 @@ "study population name": "Tokyo subway victims", "study population source": "", "study published": true, - "tags": "|tag2|", + "tags": "tag2", "upper CI": null, "upper bound interval": null, "upper range": null,