def cont_ci(stdev, n, response, alpha: float = 0.05):
    """
    Two-tailed t-based confidence interval around a continuous group mean.

    The standard error is ``stdev / sqrt(n)``; the critical value is the
    t quantile at ``1 - alpha / 2`` with ``max(n - 1, 1)`` degrees of freedom,
    so a group of one still produces an interval rather than an error.

    Args:
        stdev: standard deviation of the group response.
        n: group size; must be greater than zero (it is used as a divisor).
        response: mean response of the group.
        alpha: two-tailed significance level. The default of 0.05 reproduces
            the 95% interval this function previously hard-coded.

    Returns:
        A ``(lower_ci, upper_ci)`` tuple centred on ``response``.
    """
    se = stdev / math.sqrt(n)
    change = stats.t.ppf(1 - alpha / 2, max(n - 1, 1)) * se
    return response - change, response + change


def dich_ci(incidence, n, alpha: float = 0.05):
    """
    Add confidence intervals to dichotomous datasets.
    https://www.epa.gov/sites/production/files/2020-09/documents/bmds_3.2_user_guide.pdf

    The error bars shown in BMDS plots use alpha = 0.05 and so
    represent the 95% confidence intervals on the observed
    proportions (independent of model).

    Args:
        incidence: number of affected animals in the group.
        n: group size; must be greater than zero (it is used as a divisor).
        alpha: two-tailed significance level; defaults to the 0.05 described
            above.

    Returns:
        A ``(lower_ci, upper_ci)`` tuple of proportions.
    """
    p = incidence / float(n)
    q = 1.0 - p
    z = stats.norm.ppf(1 - alpha / 2)
    z2 = z * z
    tmp1 = 2 * n * p + z2
    denominator = 2 * (n + z2)
    # NOTE(review): the lower bound has the form of a continuity-corrected
    # Wilson score limit. The textbook upper limit uses (2 - 1 / n) inside the
    # square root; the (2 + 1 / n) term below is kept exactly as written --
    # confirm against the BMDS guide linked in the docstring before changing.
    lower_ci = ((tmp1 - 1) - z * np.sqrt(z2 - (2 + 1 / n) + 4 * p * (n * q + 1))) / denominator
    upper_ci = ((tmp1 + 1) + z * np.sqrt(z2 + (2 + 1 / n) + 4 * p * (n * q - 1))) / denominator
    return lower_ci, upper_ci
def percent_control(n_1, mu_1, sd_1, n_2, mu_2, sd_2):
    """
    Percent change of a treated group mean relative to the control mean.

    Arguments suffixed ``_1`` describe the control group and ``_2`` the
    treated group (n, mean, standard deviation).

    Returns:
        ``(mean, low, high)``. ``mean`` is None unless both means are present
        and greater than zero. ``low`` and ``high`` are None unless all four
        of the standard deviations and group sizes are truthy; otherwise they
        are ``mean`` minus/plus 1.96 times the computed standard deviation,
        expressed in percent.
    """
    # ``not (x > 0)`` rather than ``x <= 0`` so NaN means are also rejected
    if mu_1 is None or mu_2 is None or not (mu_1 > 0) or not (mu_2 > 0):
        return None, None, None

    mean = (mu_2 - mu_1) / mu_1 * 100.0
    if not (sd_1 and sd_2 and n_1 and n_2):
        return mean, None, None

    sd = math.sqrt(
        pow(mu_1, -2)
        * ((pow(sd_2, 2) / n_2) + (pow(mu_2, 2) * pow(sd_1, 2)) / (n_1 * pow(mu_1, 2)))
    )
    ci = (1.96 * sd) * 100
    low, high = sorted([mean - ci, mean + ci])
    return mean, low, high


def maximum_percent_control_change(changes: list):
    """
    Return the percent-control change with the largest magnitude.

    The sign of the selected change is kept; when the smallest and largest
    values have equal magnitude the largest (positive) one is returned. An
    empty collection yields 0. Useful for ordering data-pivot results.
    """
    if len(changes) == 0:
        return 0

    smallest, largest = min(changes), max(changes)
    return smallest if abs(smallest) > abs(largest) else largest


class ExperimentExport(ModelExport):
    """Column definitions for the experiment portion of an export."""

    def get_value_map(self):
        """Return a mapping of export column name to the identically named field/annotation."""
        columns = (
            "id",
            "url",
            "name",
            "type_display",
            "has_multiple_generations",
            "chemical",
            "cas",
            "dtxsid",
            "chemical_source",
            "purity_available",
            "purity_qualifier",
            "purity",
            "vehicle",
            "guideline_compliance",
            "description",
        )
        return {column: column for column in columns}

    def get_annotation_map(self, query_prefix):
        """Return the computed columns (url, type display), built relative to ``query_prefix``."""
        url = sql_format("/ani/experiment/{}/", query_prefix + "id")  # hardcoded URL
        type_display = sql_display(query_prefix + "type", constants.ExperimentType)
        return {"url": url, "type_display": type_display}

    def prepare_df(self, df):
        """Run ``cleanHTML`` over the description column when it is present."""
        column = self.get_column_name("description")
        if column not in df.columns:
            return df
        df.loc[:, column] = df[column].apply(cleanHTML)
        return df


class AnimalGroupExport(ModelExport):
    """Column definitions for the animal-group portion of an export."""

    def get_value_map(self):
        """Return the column mapping; species and strain names come from related rows."""
        same_name = (
            "id",
            "url",
            "name",
            "sex_display",
            "sex_symbol",
            "animal_source",
            "lifestage_exposed",
            "lifestage_assessed",
            "siblings",
            "parents_display",
            "generation",
            "comments",
            "diet",
        )
        value_map = {column: column for column in same_name}
        value_map["species_name"] = "species__name"
        value_map["strain_name"] = "strain__name"
        return value_map

    def get_annotation_map(self, query_prefix):
        """Return the computed columns (url, sex text/symbol, parents), relative to ``query_prefix``."""
        sex_field = query_prefix + "sex"
        parents = Cast(query_prefix + "parents", output_field=CharField())
        return {
            "url": sql_format("/ani/animal-group/{}/", query_prefix + "id"),  # hardcoded URL
            "sex_display": sql_display(sex_field, constants.Sex),
            "sex_symbol": sql_display(sex_field, models.AnimalGroup.SEX_SYMBOLS),
            "parents_display": str_m2m(parents),
        }

    def prepare_df(self, df):
        """Run ``cleanHTML`` over the comments column when it is present."""
        column = self.get_column_name("comments")
        if column not in df.columns:
            return df
        df.loc[:, column] = df[column].apply(cleanHTML)
        return df


class DosingRegimeExport(ModelExport):
    """Column definitions for the dosing-regime portion of an export."""

    def get_value_map(self):
        """Return a mapping of export column name to the identically named field/annotation."""
        columns = (
            "id",
            "dosed_animals",
            "route_of_exposure_display",
            "duration_exposure",
            "duration_exposure_text",
            "duration_observation",
            "num_dose_groups",
            "positive_control_display",
            "negative_control_display",
            "description",
        )
        return {column: column for column in columns}

    def get_annotation_map(self, query_prefix):
        """Return display-text annotations for the route and control fields."""
        # the positive-control choices are (value, label) pairs; build a lookup from them
        positive_control = dict(constants.POSITIVE_CONTROL_CHOICES)
        route = sql_display(query_prefix + "route_of_exposure", constants.RouteExposure)
        positive = sql_display(query_prefix + "positive_control", positive_control)
        negative = sql_display(query_prefix + "negative_control", constants.NegativeControl)
        return {
            "route_of_exposure_display": route,
            "positive_control_display": positive,
            "negative_control_display": negative,
        }

    def prepare_df(self, df):
        """Run ``cleanHTML`` over the description column when it is present."""
        column = self.get_column_name("description")
        if column not in df.columns:
            return df
        df.loc[:, column] = df[column].apply(cleanHTML)
        return df
class EndpointExport(ModelExport):
    """Column definitions for the endpoint portion of an export."""

    def get_value_map(self):
        """Return a mapping of export column name to the identically named field/annotation."""
        return {
            "id": "id",
            "url": "url",
            "name": "name",
            "effects_display": "effects_display",
            "system": "system",
            "organ": "organ",
            "effect": "effect",
            "effect_subtype": "effect_subtype",
            "name_term_id": "name_term_id",
            "system_term_id": "system_term_id",
            "organ_term_id": "organ_term_id",
            "effect_term_id": "effect_term_id",
            "effect_subtype_term_id": "effect_subtype_term_id",
            "litter_effects": "litter_effects",
            "litter_effect_notes": "litter_effect_notes",
            "observation_time": "observation_time",
            "observation_time_units_display": "observation_time_units_display",
            "observation_time_text": "observation_time_text",
            "data_location": "data_location",
            "response_units": "response_units",
            "data_type": "data_type",
            "data_type_display": "data_type_display",
            "variance_type": "variance_type",
            "variance_type_name": "variance_type_name",
            "confidence_interval": "confidence_interval",
            "data_reported": "data_reported",
            "data_extracted": "data_extracted",
            "values_estimated": "values_estimated",
            "expected_adversity_direction": "expected_adversity_direction",
            "expected_adversity_direction_display": "expected_adversity_direction_display",
            "monotonicity_display": "monotonicity_display",
            "statistical_test": "statistical_test",
            "trend_value": "trend_value",
            "trend_result_display": "trend_result_display",
            "diagnostic": "diagnostic",
            "power_notes": "power_notes",
            "results_notes": "results_notes",
            "endpoint_notes": "endpoint_notes",
            "additional_fields": "additional_fields",
        }

    def get_annotation_map(self, query_prefix):
        """Return the computed columns (url, tag names, display text), relative to ``query_prefix``."""
        return {
            "url": sql_format("/ani/endpoint/{}/", query_prefix + "id"),  # hardcoded URL
            "effects_display": str_m2m(query_prefix + "effects__name"),
            "observation_time_units_display": sql_display(
                query_prefix + "observation_time_units", constants.ObservationTimeUnits
            ),
            "data_type_display": sql_display(query_prefix + "data_type", constants.DataType),
            "variance_type_name": sql_display(
                query_prefix + "variance_type", models.Endpoint.VARIANCE_NAME
            ),
            "expected_adversity_direction_display": sql_display(
                query_prefix + "expected_adversity_direction", constants.AdverseDirection
            ),
            "monotonicity_display": sql_display(
                query_prefix + "monotonicity", constants.Monotonicity
            ),
            "trend_result_display": sql_display(
                query_prefix + "trend_result", constants.TrendResult
            ),
        }

    def prepare_df(self, df):
        """Run ``cleanHTML`` over each notes column that is present in ``df``."""
        # clean html text; each column is checked on its own so that exporting
        # only one of the two notes fields neither raises nor skips cleaning
        for field in ("results_notes", "endpoint_notes"):
            column = self.get_column_name(field)
            if column in df.columns:
                df.loc[:, column] = df[column].apply(cleanHTML)

        return df


class EndpointGroupExport(ModelExport):
    """Column definitions for the endpoint-group portion of an export."""

    def get_value_map(self):
        """Return a mapping of export column name to the identically named field/annotation."""
        return {
            "id": "id",
            "dose_group_id": "dose_group_id",
            "n": "n",
            "incidence": "incidence",
            "response": "response",
            "variance": "variance",
            "lower_ci": "lower_ci",
            "upper_ci": "upper_ci",
            "significant": "significant",
            "significance_level": "significance_level",
            "treatment_effect": "treatment_effect",
            "treatment_effect_display": "treatment_effect_display",
            "NOEL": "NOEL",
            "LOEL": "LOEL",
            "FEL": "FEL",
        }

    def get_annotation_map(self, query_prefix):
        """
        Return the computed columns, relative to ``query_prefix``.

        NOEL, LOEL and FEL are boolean expressions comparing this group's
        ``dose_group_id`` with the value stored on its endpoint.
        """
        dose_group_id = query_prefix + "dose_group_id"
        return {
            "treatment_effect_display": sql_display(
                query_prefix + "treatment_effect", constants.TreatmentEffect, default=None
            ),
            "NOEL": Exact(F(dose_group_id), F(query_prefix + "endpoint__NOEL")),
            "LOEL": Exact(F(dose_group_id), F(query_prefix + "endpoint__LOEL")),
            "FEL": Exact(F(dose_group_id), F(query_prefix + "endpoint__FEL")),
        }
class DoseGroupExport(ModelExport):
    """Column definitions for the dose-group portion of an export."""

    def get_value_map(self):
        """Return the column mapping; the dose-unit columns come from the related units row."""
        return {
            "id": "id",
            "dose_units_id": "dose_units__id",
            "dose_units_name": "dose_units__name",
            "dose_group_id": "dose_group_id",
            "dose": "dose",
        }


class EndpointGroupFlatCompleteExporter(Exporter):
    """Exporter listing every module used by the complete endpoint-group export."""

    def build_modules(self) -> list[ModelExport]:
        """
        Return the export modules, queried relative to an endpoint.

        Each module is given a key (it appears as the prefix of the resulting
        dataframe columns, e.g. ``endpoint_group-id``) and a query path.
        """
        return [
            StudyExport(
                "study",
                "animal_group__experiment__study",
            ),
            ExperimentExport(
                "experiment",
                "animal_group__experiment",
            ),
            AnimalGroupExport("animal_group", "animal_group", exclude=("sex_symbol",)),
            DosingRegimeExport(
                "dosing_regime",
                "animal_group__dosing_regime",
            ),
            EndpointExport(
                "endpoint", "", exclude=("expected_adversity_direction", "data_type_display")
            ),
            EndpointGroupExport("endpoint_group", "groups", exclude=("treatment_effect",)),
            DoseGroupExport(
                "dose_group", "animal_group__dosing_regime__doses", exclude=("dose_units_id",)
            ),
        ]


class EndpointGroupFlatComplete(FlatFileExporter):
    """
    Complete flat export of animal bioassay data, one row per endpoint group.

    Doses are pivoted into one ``doses-<units>`` column per dose unit, and the
    standard deviation and confidence intervals are derived where possible.
    """

    def handle_doses(self, df: pd.DataFrame) -> pd.DataFrame:
        """Collapse the per-dose rows of each endpoint group into ``doses-<units>`` columns."""

        # TODO this is really slow; maybe it's the filtering to find matching dose group ids?
        # solutions: ?, put the burden on SQL w/ Prefetch and Subquery (messy)
        # long term solutions: group and dose group should be related
        def _func(group_df: pd.DataFrame) -> pd.DataFrame:
            # handle case with no dose data
            if group_df["dose_group-id"].isna().all():
                return group_df

            # add dose data
            group_df["doses-" + group_df["dose_group-dose_units_name"]] = group_df[
                "dose_group-dose"
            ].tolist()

            # return a df that is dose agnostic
            return group_df.drop_duplicates(
                subset=group_df.columns[group_df.columns.str.endswith("-id")].difference(
                    ["dose_group-id"]
                )
            )

        return (
            df.groupby("endpoint_group-id", group_keys=False, sort=False)
            .apply(_func)
            .drop(
                columns=[
                    "dose_group-id",
                    "dose_group-dose_units_name",
                    "dose_group-dose_group_id",
                    "dose_group-dose",
                ]
            )
            .reset_index(drop=True)
        )

    def handle_stdev(self, df: pd.DataFrame) -> pd.DataFrame:
        """Add ``endpoint_group-stdev`` and drop the raw ``endpoint-variance_type`` column."""
        df["endpoint_group-stdev"] = df.apply(
            lambda x: models.EndpointGroup.stdev(
                x["endpoint-variance_type"],
                x["endpoint_group-variance"],
                x["endpoint_group-n"],
            ),
            axis="columns",
        )
        return df.drop(columns=["endpoint-variance_type"])

    def handle_ci(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Fill in missing confidence intervals, then drop the helper stdev column.

        Rows that already have either bound, or that have no positive ``n``,
        are left untouched. Continuous rows use ``cont_ci``; dichotomous rows
        use ``dich_ci``.
        """
        dichotomous = [constants.DataType.DICHOTOMOUS, constants.DataType.DICHOTOMOUS_CANCER]

        def _func(row: pd.Series) -> pd.Series:
            # logic used from EndpointGroup.getConfidenceIntervals()
            data_type = row["endpoint-data_type"]
            lower_ci = row["endpoint_group-lower_ci"]
            upper_ci = row["endpoint_group-upper_ci"]
            n = row["endpoint_group-n"]

            response = row["endpoint_group-response"]
            stdev = row["endpoint_group-stdev"]
            incidence = row["endpoint_group-incidence"]

            # Missing numbers in a dataframe are usually NaN, and NaN "is not
            # None"; isna/notna treat both None and NaN as missing so an
            # absent bound is not mistaken for an existing one.
            if pd.notna(lower_ci) or pd.notna(upper_ci) or pd.isna(n) or n <= 0:
                return row

            if (
                data_type == constants.DataType.CONTINUOUS
                and pd.notna(response)
                and pd.notna(stdev)
            ):
                (
                    row["endpoint_group-lower_ci"],
                    row["endpoint_group-upper_ci"],
                ) = cont_ci(stdev, n, response)
            elif data_type in dichotomous and pd.notna(incidence):
                (
                    row["endpoint_group-lower_ci"],
                    row["endpoint_group-upper_ci"],
                ) = dich_ci(incidence, n)
            return row

        return df.apply(_func, axis="columns").drop(columns=["endpoint_group-stdev"])

    def build_df(self) -> pd.DataFrame:
        """Query the export, pivot doses, derive stdev/CI and rename the display columns."""
        df = EndpointGroupFlatCompleteExporter().get_df(
            self.queryset.select_related(
                "animal_group__experiment__study",
                "animal_group__dosing_regime",
            )
            .prefetch_related("groups", "animal_group__dosing_regime__doses")
            .order_by("id", "groups", "animal_group__dosing_regime__doses")
        )
        # keep rows without dose data, or where the dose row belongs to the
        # same dose group as the endpoint-group row
        df = df[
            pd.isna(df["dose_group-id"])
            | (df["endpoint_group-dose_group_id"] == df["dose_group-dose_group_id"])
        ]
        if df.empty:
            return df
        # NOTE(review): ``doses`` is only bound when the queryset has a first
        # object; presumably a non-empty dataframe guarantees that -- confirm.
        if obj := self.queryset.first():
            doses = DoseUnits.objects.get_animal_units_names(obj.assessment_id)

        df = df.assign(**{f"doses-{d}": None for d in doses})
        df = self.handle_doses(df)
        df["dosing_regime-dosed_animals"] = df["dosing_regime-dosed_animals"].astype(str)
        df = self.handle_stdev(df)
        df = self.handle_ci(df)

        df = df.rename(
            columns={
                "endpoint_group-treatment_effect_display": "endpoint_group-treatment_effect",
                "endpoint-expected_adversity_direction_display": "endpoint-expected_adversity_direction",
                "experiment-type_display": "experiment-type",
                "animal_group-sex_display": "animal_group-sex",
                "dosing_regime-positive_control_display": "dosing_regime-positive_control",
                "endpoint-effects_display": "endpoint-effects",
                "animal_group-parents_display": "animal_group-parents",
                "dosing_regime-route_of_exposure_display": "dosing_regime-route_of_exposure",
                "endpoint-monotonicity_display": "endpoint-monotonicity",
                "dosing_regime-negative_control_display": "dosing_regime-negative_control",
                "endpoint-observation_time_units_display": "endpoint-observation_time_units",
                "endpoint-trend_result_display": "endpoint-trend_result",
                "endpoint-variance_type_name": "endpoint-variance_type",
                "animal_group-species_name": "species-name",
                "animal_group-strain_name": "strain-name",
            }
        )

        return df
class EndpointGroupFlatDataPivotExporter(Exporter):
    """Exporter listing the modules and columns used by the data-pivot export."""

    def build_modules(self) -> list[ModelExport]:
        """Return the export modules, each restricted to the columns the data pivot needs."""
        return [
            StudyExport(
                "study",
                "animal_group__experiment__study",
                include=("id", "short_citation", "study_identifier", "published"),
            ),
            ExperimentExport(
                "experiment",
                "animal_group__experiment",
                include=("id", "name", "type_display", "chemical"),
            ),
            AnimalGroupExport(
                "animal_group",
                "animal_group",
                include=(
                    "id",
                    "name",
                    "lifestage_exposed",
                    "lifestage_assessed",
                    "species_name",
                    "strain_name",
                    "generation",
                    "sex_display",
                    "sex_symbol",
                ),
            ),
            DosingRegimeExport(
                "dosing_regime",
                "animal_group__dosing_regime",
                include=(
                    "route_of_exposure_display",
                    "duration_exposure_text",
                    "duration_exposure",
                ),
            ),
            EndpointExport(
                "endpoint",
                "",
                include=(
                    "id",
                    "name",
                    "system",
                    "organ",
                    "effect",
                    "effect_subtype",
                    "diagnostic",
                    "effects_display",
                    "observation_time",
                    "observation_time_units_display",
                    "observation_time_text",
                    "variance_type",
                    "data_type",
                    "data_type_display",
                    "trend_value",
                    "trend_result_display",
                    "expected_adversity_direction",
                    "response_units",
                ),
            ),
            EndpointGroupExport(
                "endpoint_group",
                "groups",
                include=(
                    "id",
                    "dose_group_id",
                    "n",
                    "incidence",
                    "response",
                    "lower_ci",
                    "upper_ci",
                    "significant",
                    "significance_level",
                    "treatment_effect_display",
                    "NOEL",
                    "LOEL",
                    "FEL",
                    "variance",
                ),
            ),
            DoseGroupExport(
                "dose_group",
                "animal_group__dosing_regime__doses",
                include=("id", "dose_units_id", "dose_units_name", "dose_group_id", "dose"),
            ),
        ]


class EndpointGroupFlatDataPivot(FlatFileExporter):
    """
    Export a subset of frequently used data for data-pivot visualizations.

    ``build_df`` produces one row per endpoint group in a single dose unit and
    adds derived columns (doses summary, animal description, treatment period,
    percent control, incidence summary, risk-of-bias scores).
    """

    def get_preferred_units(self, df: pd.DataFrame) -> int | None:
        """
        Pick the dose-units id to report for the rows in ``df``.

        Returns the first entry of ``self.kwargs["preferred_units"]`` that is
        available in ``df``, else the first available units id, else None when
        ``df`` has no dose units at all.
        """
        preferred_units = self.kwargs.get("preferred_units", None)
        available_units = df["dose_group-dose_units_id"].dropna().unique()
        if available_units.size == 0:
            return None
        if preferred_units:
            for units in preferred_units:
                if units in available_units:
                    return units
        return available_units[0]

    def handle_ci(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Fill in missing confidence intervals.

        Rows that already have either bound, or that have no positive ``n``,
        are left untouched. Continuous rows use ``cont_ci``; dichotomous rows
        use ``dich_ci``.
        """
        dichotomous = [constants.DataType.DICHOTOMOUS, constants.DataType.DICHOTOMOUS_CANCER]

        def _func(row: pd.Series) -> pd.Series:
            # logic used from EndpointGroup.getConfidenceIntervals()
            data_type = row["endpoint-data_type"]
            lower_ci = row["endpoint_group-lower_ci"]
            upper_ci = row["endpoint_group-upper_ci"]
            n = row["endpoint_group-n"]

            response = row["endpoint_group-response"]
            stdev = row["endpoint_group-stdev"]
            incidence = row["endpoint_group-incidence"]

            # Missing numbers in a dataframe are usually NaN, and NaN "is not
            # None"; isna/notna treat both None and NaN as missing so an
            # absent bound is not mistaken for an existing one.
            if pd.notna(lower_ci) or pd.notna(upper_ci) or pd.isna(n) or n <= 0:
                return row

            if (
                data_type == constants.DataType.CONTINUOUS
                and pd.notna(response)
                and pd.notna(stdev)
            ):
                (
                    row["endpoint_group-lower_ci"],
                    row["endpoint_group-upper_ci"],
                ) = cont_ci(stdev, n, response)
            elif data_type in dichotomous and pd.notna(incidence):
                (
                    row["endpoint_group-lower_ci"],
                    row["endpoint_group-upper_ci"],
                ) = dich_ci(incidence, n)
            return row

        return df.apply(_func, axis="columns")

    def handle_stdev(self, df: pd.DataFrame) -> pd.DataFrame:
        """Add an ``endpoint_group-stdev`` column computed by ``EndpointGroup.stdev``."""
        df["endpoint_group-stdev"] = df.apply(
            lambda x: models.EndpointGroup.stdev(
                x["endpoint-variance_type"],
                x["endpoint_group-variance"],
                x["endpoint_group-n"],
            ),
            axis="columns",
        )
        return df

    def handle_percent_control(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Add percent-control columns and the per-endpoint maximum change.

        Within each endpoint the first row is used as the control group.
        """

        def _func(group_df: pd.DataFrame) -> pd.DataFrame:
            control = group_df.iloc[0]

            data_type = control["endpoint-data_type"]
            i_1 = control["endpoint_group-incidence"]
            n_1 = control["endpoint_group-n"]
            mu_1 = control["endpoint_group-response"]
            sd_1 = control["endpoint_group-stdev"]

            def _row_func(row: pd.Series) -> pd.Series:
                # logic used from EndpointGroup.percentControl()
                row["percent control mean"] = None
                row["percent control low"] = None
                row["percent control high"] = None
                if data_type == constants.DataType.CONTINUOUS:
                    n_2 = row["endpoint_group-n"]
                    mu_2 = row["endpoint_group-response"]
                    sd_2 = row["endpoint_group-stdev"]
                    (
                        row["percent control mean"],
                        row["percent control low"],
                        row["percent control high"],
                    ) = percent_control(n_1, mu_1, sd_1, n_2, mu_2, sd_2)
                elif data_type == constants.DataType.PERCENT_DIFFERENCE:
                    # already expressed relative to control; copy through
                    row["percent control mean"] = row["endpoint_group-response"]
                    row["percent control low"] = row["endpoint_group-lower_ci"]
                    row["percent control high"] = row["endpoint_group-upper_ci"]
                elif data_type == constants.DataType.DICHOTOMOUS:
                    if i_1 and n_1:
                        i_2 = row["endpoint_group-incidence"]
                        n_2 = row["endpoint_group-n"]
                        if n_2:
                            row["percent control mean"] = (
                                ((i_2 / n_2) - (i_1 / n_1)) / (i_1 / n_1) * 100
                            )
                return row

            group_df = group_df.apply(_row_func, axis="columns")
            group_df["maximum endpoint change"] = maximum_percent_control_change(
                group_df["percent control mean"].dropna()
            )
            return group_df

        return (
            df.groupby("endpoint-id", group_keys=False, sort=False)
            .apply(_func)
            .reset_index(drop=True)
        )

    def handle_animal_description(self, df: pd.DataFrame):
        """Add the ``animal description`` columns (with and without the N range) per endpoint."""

        def _func(group_df: pd.DataFrame) -> pd.DataFrame:
            gen = group_df["animal_group-generation"].iloc[0]
            if len(gen) > 0:
                gen += " "
            ns_txt = ""
            ns = group_df["endpoint_group-n"].dropna().astype(int).tolist()
            if len(ns) > 0:
                ns_txt = ", N=" + models.EndpointGroup.getNRangeText(ns)

            sex_symbol = group_df["animal_group-sex_symbol"].iloc[0]
            if sex_symbol == "NR":
                sex_symbol = "sex=NR"
            species = group_df["animal_group-species_name"].iloc[0]
            strain = group_df["animal_group-strain_name"].iloc[0]
            group_df["animal description"] = f"{gen}{species}, {strain} ({sex_symbol})"
            group_df[
                "animal description (with N)"
            ] = f"{gen}{species}, {strain} ({sex_symbol}{ns_txt})"

            return group_df

        return (
            df.groupby("endpoint-id", group_keys=False, sort=False)
            .apply(_func)
            .reset_index(drop=True)
        )

    def handle_treatment_period(self, df: pd.DataFrame):
        """
        Add the ``treatment period`` column.

        The value is the lower-cased experiment type truncated at its first
        "(", followed by the duration text in parentheses when there is one.
        """
        txt = df["experiment-type_display"].str.lower()
        txt_index = txt.str.find("(")
        txt_updated = (
            txt.to_frame(name="txt")
            .join(txt_index.to_frame(name="txt_index"))
            .apply(
                lambda x: x["txt"] if x["txt_index"] < 0 else x["txt"][: x["txt_index"]],
                axis="columns",
                result_type="reduce",
            )
        ).astype(str)
        duration = df["dosing_regime-duration_exposure_text"]
        # rows without duration text fall back to the bare type text; without
        # the ``other`` argument they would become NaN
        df["treatment period"] = (txt_updated + " (" + duration + ")").where(
            duration.str.len() > 0, txt_updated
        )
        return df

    def handle_dose_groups(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Reduce each endpoint to one dose unit and add dose summary columns.

        Adds ``doses``, ``low_dose``, ``high_dose``, the assessment's NOEL and
        LOEL column names, and ``FEL``.
        """
        noel_names = self.kwargs["assessment"].get_noel_names()

        def _func(group_df: pd.DataFrame) -> pd.DataFrame:
            preferred_units = self.get_preferred_units(group_df)
            # copy so the column assignments below act on an independent frame
            group_df = group_df[
                (group_df["dose_group-dose_units_id"] == preferred_units)
            ].copy()
            # a dose counts as reported when the group has n, response or incidence
            reported_doses = group_df["dose_group-dose"].mask(
                pd.isna(group_df["endpoint_group-n"])
                & pd.isna(group_df["endpoint_group-response"])
                & pd.isna(group_df["endpoint_group-incidence"])
            )
            doses = (
                group_df["dose_group-dose"]
                if reported_doses.dropna().empty
                else reported_doses.dropna()
            )
            group_df["doses"] = (
                ", ".join(doses.astype(str)) + " " + group_df["dose_group-dose_units_name"]
            )

            if reported_doses.dropna().empty:
                group_df["low_dose"] = None
                group_df["high_dose"] = None
                group_df[noel_names.noel] = None
                group_df[noel_names.loel] = None
                group_df["FEL"] = None
                return group_df
            # low/high dose skip the first row of the group
            low_dose_index = reported_doses.iloc[1:].first_valid_index()
            group_df["low_dose"] = (
                None if low_dose_index is None else reported_doses.loc[low_dose_index]
            )
            high_dose_index = reported_doses.iloc[1:].last_valid_index()
            group_df["high_dose"] = (
                None if high_dose_index is None else reported_doses.loc[high_dose_index]
            )
            NOEL_series = group_df["dose_group-dose"][
                group_df["endpoint_group-NOEL"].fillna(False) & pd.notna(reported_doses)
            ]
            group_df[noel_names.noel] = NOEL_series.iloc[0] if NOEL_series.size > 0 else None
            LOEL_series = group_df["dose_group-dose"][
                group_df["endpoint_group-LOEL"].fillna(False) & pd.notna(reported_doses)
            ]
            group_df[noel_names.loel] = LOEL_series.iloc[0] if LOEL_series.size > 0 else None
            FEL_series = group_df["dose_group-dose"][
                group_df["endpoint_group-FEL"].fillna(False) & pd.notna(reported_doses)
            ]
            group_df["FEL"] = FEL_series.iloc[0] if FEL_series.size > 0 else None
            return group_df

        return (
            df.groupby("endpoint-id", group_keys=False, sort=False)
            .apply(_func)
            .reset_index(drop=True)
        )

    def handle_incidence_summary(self, df: pd.DataFrame) -> pd.DataFrame:
        """Add the dichotomous summary text and percent affected / CI columns."""
        dichotomous = [constants.DataType.DICHOTOMOUS, constants.DataType.DICHOTOMOUS_CANCER]

        def _func(group_df: pd.DataFrame) -> pd.DataFrame:
            group_df["dichotomous summary"] = "-"
            group_df["percent affected"] = None
            group_df["percent lower ci"] = None
            group_df["percent upper ci"] = None
            data_type = group_df["endpoint-data_type"].iloc[0]

            def _row_func(row: pd.Series) -> pd.Series:
                # logic used from EndpointGroup.get_incidence_summary()
                n = row["endpoint_group-n"]
                i = row["endpoint_group-incidence"]
                # notna (not "is not None") so a NaN incidence or n is skipped
                # instead of reaching int(), which raises on NaN
                if data_type in dichotomous and pd.notna(n) and n > 0 and pd.notna(i):
                    row["dichotomous summary"] = f"{int(i)}/{int(n)} ({i / n * 100:.1f}%)"
                    row["percent affected"] = i / n * 100
                    row["percent lower ci"] = row["endpoint_group-lower_ci"] * 100
                    row["percent upper ci"] = row["endpoint_group-upper_ci"] * 100
                return row

            return group_df.apply(_row_func, axis="columns")

        return (
            df.groupby("endpoint-id", group_keys=False, sort=False)
            .apply(_func)
            .reset_index(drop=True)
        )

    def build_df(self) -> pd.DataFrame:
        """Query the export, add risk-of-bias and derived columns, then rename and drop columns."""
        df = EndpointGroupFlatDataPivotExporter().get_df(
            self.queryset.select_related(
                "animal_group__experiment__study",
                "animal_group__dosing_regime",
            )
            .prefetch_related("groups", "animal_group__dosing_regime__doses")
            .order_by("id", "groups", "animal_group__dosing_regime__doses")
        )
        # keep rows without dose data, or where the dose row belongs to the
        # same dose group as the endpoint-group row
        df = df[
            pd.isna(df["dose_group-id"])
            | (df["endpoint_group-dose_group_id"] == df["dose_group-dose_group_id"])
        ]
        if df.empty:
            return df
        if obj := self.queryset.first():
            endpoint_ids = list(df["endpoint-id"].unique())
            rob_headers, rob_data = FinalRiskOfBiasScore.get_dp_export(
                obj.assessment_id,
                endpoint_ids,
                "animal",
            )
            # one row per endpoint, one column per risk-of-bias header
            rob_df = pd.DataFrame(
                data=[
                    [rob_data[(endpoint_id, metric_id)] for metric_id in rob_headers.keys()]
                    for endpoint_id in endpoint_ids
                ],
                columns=list(rob_headers.values()),
                index=endpoint_ids,
            )
            df = df.join(rob_df, on="endpoint-id")

        df["route"] = df["dosing_regime-route_of_exposure_display"].str.lower()
        df["species strain"] = (
            df["animal_group-species_name"] + " " + df["animal_group-strain_name"]
        )

        df["observation time"] = (
            df["endpoint-observation_time"].replace(np.nan, None).astype(str)
            + " "
            + df["endpoint-observation_time_units_display"]
        )

        df = self.handle_stdev(df)
        df = self.handle_ci(df)
        df = self.handle_dose_groups(df)
        df = self.handle_animal_description(df)
        df = self.handle_treatment_period(df)
        df = self.handle_percent_control(df)
        df = self.handle_incidence_summary(df)

        df = df.rename(
            columns={
                "study-id": "study id",
                "study-short_citation": "study name",
                "study-study_identifier": "study identifier",
                "study-published": "study published",
                "experiment-id": "experiment id",
                "experiment-name": "experiment name",
                "experiment-chemical": "chemical",
                "animal_group-id": "animal group id",
                "animal_group-name": "animal group name",
                "animal_group-lifestage_exposed": "lifestage exposed",
                "animal_group-lifestage_assessed": "lifestage assessed",
                "animal_group-species_name": "species",
                "animal_group-generation": "generation",
                "animal_group-sex_display": "sex",
                "dosing_regime-duration_exposure_text": "duration exposure",
                "dosing_regime-duration_exposure": "duration exposure (days)",
                "endpoint-id": "endpoint id",
                "endpoint-name": "endpoint name",
                "endpoint-system": "system",
                "endpoint-organ": "organ",
                "endpoint-effect": "effect",
                "endpoint-effect_subtype": "effect subtype",
                "endpoint-diagnostic": "diagnostic",
                "endpoint-effects_display": "tags",
                "endpoint-observation_time_text": "observation time text",
                "endpoint-data_type_display": "data type",
                "dose_group-dose_units_name": "dose units",
                "endpoint-response_units": "response units",
                "endpoint-expected_adversity_direction": "expected adversity direction",
                "endpoint-trend_value": "trend test value",
                "endpoint-trend_result_display": "trend test result",
                "endpoint_group-id": "key",
                "endpoint_group-dose_group_id": "dose index",
                "dose_group-dose": "dose",
                "endpoint_group-n": "N",
                "endpoint_group-incidence": "incidence",
                "endpoint_group-response": "response",
                "endpoint_group-stdev": "stdev",
                "endpoint_group-lower_ci": "lower_ci",
                "endpoint_group-upper_ci": "upper_ci",
                "endpoint_group-significant": "pairwise significant",
                "endpoint_group-significance_level": "pairwise significant value",
                "endpoint_group-treatment_effect_display": "treatment related effect",
            }
        )
        df = df.drop(
            columns=[
                "endpoint-observation_time",
                "dose_group-id",
                "dose_group-dose_group_id",
                "experiment-type_display",
                "endpoint_group-FEL",
                "animal_group-sex_symbol",
                "animal_group-strain_name",
                "endpoint_group-LOEL",
                "endpoint-variance_type",
                "dose_group-dose_units_id",
                "endpoint_group-NOEL",
                "endpoint-observation_time_units_display",
                "endpoint_group-variance",
                "endpoint-data_type",
                "dosing_regime-route_of_exposure_display",
            ]
        )

        return df
build_modules(self) -> list[ModelExport]: + return [ + StudyExport( + "study", + "animal_group__experiment__study", + include=("id", "short_citation", "study_identifier", "published"), + ), + ExperimentExport( + "experiment", + "animal_group__experiment", + include=("id", "name", "type_display", "chemical"), + ), + AnimalGroupExport( + "animal_group", + "animal_group", + include=( + "id", + "name", + "lifestage_exposed", + "lifestage_assessed", + "species_name", + "strain_name", + "generation", + "sex_display", + "sex_symbol", + ), + ), + DosingRegimeExport( + "dosing_regime", + "animal_group__dosing_regime", + include=( + "route_of_exposure_display", + "duration_exposure_text", + "duration_exposure", + ), + ), + EndpointExport( + "endpoint", + "", + include=( + "id", + "name", + "system", + "organ", + "effect", + "effect_subtype", + "diagnostic", + "effects_display", + "observation_time", + "observation_time_units_display", + "observation_time_text", + "variance_type", + "data_type", + "data_type_display", + "trend_value", + "trend_result_display", + "expected_adversity_direction", + "response_units", + ), + ), + EndpointGroupExport( + "endpoint_group", + "groups", + include=( + "id", + "dose_group_id", + "n", + "incidence", + "response", + "lower_ci", + "upper_ci", + "significant", + "significance_level", + "treatment_effect_display", + "NOEL", + "LOEL", + "FEL", + "variance", + ), + ), + DoseGroupExport( + "dose_group", + "animal_group__dosing_regime__doses", + include=("id", "dose_units_id", "dose_units_name", "dose_group_id", "dose"), + ), ] - headers.extend(list(self.rob_headers.values())) - return headers - def _get_data_rows(self): +class EndpointFlatDataPivot(EndpointGroupFlatDataPivot): + def handle_bmd(self, df: pd.DataFrame) -> pd.DataFrame: + endpoint_ids = df["endpoint-id"].unique() + sessions = Session.objects.filter(endpoint_id__in=endpoint_ids, active=True) + bmd_map = defaultdict(list) + for session in sessions: + 
bmd_map[session.endpoint_id].append(session.get_selected_model()) preferred_units = self.kwargs.get("preferred_units", None) + df["BMD"] = None + df["BMDL"] = None + + def _func(row: pd.Series) -> pd.Series: + bmds = bmd_map[row["endpoint-id"]] + for bmd in bmds: + if bmd["dose_units_id"] in preferred_units and bmd["model"] is not None: + row["BMD"] = bmd["bmd"] + row["BMDL"] = bmd["bmdl"] + break + return row - rows = [] - for obj in self.queryset: - ser = obj.get_json(json_encode=False) - doses = self._get_doses_list(ser, preferred_units) - endpoint_robs = [ - self.rob_data[(ser["id"], metric_id)] for metric_id in self.rob_headers.keys() - ] + return df.apply(_func, axis="columns") - # build endpoint-group independent data - row = [ - ser["animal_group"]["experiment"]["study"]["id"], - ser["animal_group"]["experiment"]["study"]["short_citation"], - ser["animal_group"]["experiment"]["study"]["study_identifier"], - ser["animal_group"]["experiment"]["study"]["published"], - ser["animal_group"]["experiment"]["id"], - ser["animal_group"]["experiment"]["name"], - ser["animal_group"]["experiment"]["chemical"], - ser["animal_group"]["id"], - ser["animal_group"]["name"], - ser["animal_group"]["lifestage_exposed"], - ser["animal_group"]["lifestage_assessed"], - ser["animal_group"]["species"], - self._get_species_strain(ser), - ser["animal_group"]["generation"], - get_gen_species_strain_sex(ser, withN=False), - get_gen_species_strain_sex(ser, withN=True), - ser["animal_group"]["sex"], - ser["animal_group"]["dosing_regime"]["route_of_exposure"].lower(), - get_treatment_period( - ser["animal_group"]["experiment"], - ser["animal_group"]["dosing_regime"], - ), - ser["animal_group"]["dosing_regime"]["duration_exposure_text"], - ser["animal_group"]["dosing_regime"]["duration_exposure"], - ser["id"], - ser["name"], - ser["system"], - ser["organ"], - ser["effect"], - ser["effect_subtype"], - ser["diagnostic"], - self.get_flattened_tags(ser, "effects"), - 
self._get_observation_time_and_time_units(ser), - ser["observation_time_text"], - ser["data_type_label"], - self._get_doses_str(doses), - self._get_dose_units(doses), - ser["response_units"], - ser["expected_adversity_direction"], - ser["percentControlMaxChange"], - ] + def handle_flat_doses(self, df: pd.DataFrame) -> pd.DataFrame: + def _func(group_df: pd.DataFrame) -> pd.Series: + unique_df = group_df.drop_duplicates(subset="endpoint_group-id").reset_index(drop=True) + reported_doses = unique_df["dose_group-dose"].mask( + pd.isna(unique_df["endpoint_group-n"]) + & pd.isna(unique_df["endpoint_group-response"]) + & pd.isna(unique_df["endpoint_group-incidence"]) + ) + num_doses = reported_doses.size - # dose-group specific information - if len(ser["groups"]) > 1: - row.extend( - [ - self._get_dose(doses, 1), # first non-zero dose - self._get_dose(doses, len(ser["groups"]) - 1), - self._get_dose(doses, ser["NOEL"]), - self._get_dose(doses, ser["LOEL"]), - self._get_dose(doses, ser["FEL"]), - ] + group_df[[f"Dose {i}" for i in range(1, num_doses + 1)]] = reported_doses.reset_index( + drop=True + ) + + data_type = unique_df["endpoint-data_type"].iloc[0] + control_group = unique_df.iloc[0] + if pd.isna(unique_df["endpoint_group-id"]).all(): + pass + elif data_type in { + constants.DataType.CONTINUOUS, + constants.DataType.PERCENT_DIFFERENCE, + constants.DataType.DICHOTOMOUS, + constants.DataType.DICHOTOMOUS_CANCER, + }: + if data_type in { + constants.DataType.CONTINUOUS, + constants.DataType.PERCENT_DIFFERENCE, + }: + field = "endpoint_group-response" + elif data_type in { + constants.DataType.DICHOTOMOUS, + constants.DataType.DICHOTOMOUS_CANCER, + }: + field = "percent affected" + control_resp = control_group[field] + insignificant = pd.Series(["No"] * num_doses) + significant = pd.Series(["Yes - ?"] * num_doses) + significant_up = pd.Series(["Yes - ↑"] * num_doses) + significant_down = pd.Series(["Yes - ↓"] * num_doses) + + significance = insignificant.mask( + 
(unique_df["endpoint_group-significant"].fillna(False)), + significant_down.mask((unique_df[field] > control_resp), significant_up).mask( + ( + (pd.isna(control_resp)) + | (pd.isna(unique_df[field])) + | (unique_df[field] == control_resp) + ), + significant, + ), ) - else: - row.extend([None] * 5) - - row.extend([ser["trend_value"], ser["trend_result"]]) - - # endpoint-group information - for i, eg in enumerate(ser["groups"]): - row_copy = copy(row) - row_copy.extend( - [ - eg["id"], - eg["dose_group_id"], - self._get_dose(doses, i), - eg["n"], - eg["incidence"], - eg["response"], - eg["stdev"], - eg["lower_ci"], - eg["upper_ci"], - eg["significant"], - eg["significance_level"], - eg["treatment_effect"], - eg["percentControlMean"], - eg["percentControlLow"], - eg["percentControlHigh"], - eg["dichotomous_summary"], - eg["percent_affected"], - eg["percent_lower_ci"], - eg["percent_upper_ci"], - ] + group_df[[f"Significant {i}" for i in range(1, num_doses + 1)]] = significance + elif data_type == constants.DataType.NR: + group_df[[f"Significant {i}" for i in range(1, num_doses + 1)]] = pd.Series( + ["?"] * num_doses ) - row_copy.extend(endpoint_robs) - rows.append(row_copy) - return rows + group_df[ + [f"Treatment Related Effect {i}" for i in range(1, num_doses + 1)] + ] = unique_df["endpoint_group-treatment_effect_display"].reset_index(drop=True) + return group_df.drop_duplicates( + subset=group_df.columns[group_df.columns.str.endswith("-id")].difference( + ["endpoint_group-id"] + ) + ) -class EndpointFlatDataPivot(EndpointGroupFlatDataPivot): - def _get_header_row(self): - if self.queryset.first() is None: - self.rob_headers, self.rob_data = {}, {} - else: - endpoint_ids = set(self.queryset.values_list("id", flat=True)) - self.rob_headers, self.rob_data = FinalRiskOfBiasScore.get_dp_export( - self.queryset.first().assessment_id, + return ( + df.groupby("endpoint-id", group_keys=False, sort=False) + .apply(_func) + .reset_index(drop=True) + ) + + def build_df(self) 
-> pd.DataFrame: + df = EndpointFlatDataPivotExporter().get_df( + self.queryset.select_related( + "animal_group__experiment__study", + "animal_group__dosing_regime", + ) + .prefetch_related("groups", "animal_group__dosing_regime__doses") + .order_by("id", "groups", "animal_group__dosing_regime__doses") + ) + df = df[ + pd.isna(df["endpoint_group-id"]) + | pd.isna(df["dose_group-id"]) + | (df["endpoint_group-dose_group_id"] == df["dose_group-dose_group_id"]) + ] + if df.empty: + return df + if obj := self.queryset.first(): + endpoint_ids = list(df["endpoint-id"].unique()) + rob_headers, rob_data = FinalRiskOfBiasScore.get_dp_export( + obj.assessment_id, endpoint_ids, "animal", ) - - noel_names = self.kwargs["assessment"].get_noel_names() - header = [ - "study id", - "study name", - "study identifier", - "study published", - "experiment id", - "experiment name", - "chemical", - "animal group id", - "animal group name", - "lifestage exposed", - "lifestage assessed", - "species", - "species strain", - "generation", - "animal description", - "animal description (with N)", - "sex", - "route", - "treatment period", - "duration exposure", - "duration exposure (days)", - "endpoint id", - "endpoint name", - "system", - "organ", - "effect", - "effect subtype", - "diagnostic", - "tags", - "observation time", - "observation time text", - "data type", - "doses", - "dose units", - "response units", - "expected adversity direction", - "low_dose", - "high_dose", - noel_names.noel, - noel_names.loel, - "FEL", - "BMD", - "BMDL", - "trend test value", - "trend test result", - ] - - num_doses = self.queryset.model.max_dose_count(self.queryset) - rng = range(1, num_doses + 1) - header.extend([f"Dose {i}" for i in rng]) - header.extend([f"Significant {i}" for i in rng]) - header.extend([f"Treatment Related Effect {i}" for i in rng]) - header.extend(list(self.rob_headers.values())) - - # distinct applied last so that queryset can add annotations above - # in 
self.queryset.model.max_dose_count - self.queryset = self.queryset.distinct("pk") - self.num_doses = num_doses - - return header - - @staticmethod - def _get_bmd_values(bmds, preferred_units): - # only return BMD values if they're in the preferred units - for bmd in bmds: - # return first match - if bmd["dose_units_id"] in preferred_units and bmd["model"] is not None: - return [bmd["bmd"], bmd["bmdl"]] - return [None, None] - - @staticmethod - def _dose_is_reported(dose_group_id: int, groups: list[dict]) -> bool: - """ - Check if any numerical data( n, response, or incidence) was entered for a dose-group - """ - for group in groups: - if group["dose_group_id"] == dose_group_id: - return any(group.get(key) is not None for key in ["n", "response", "incidence"]) - return False - - @staticmethod - def _dose_low_high(dose_list: list[float | None]) -> tuple[float | None, float | None]: - """ - Finds the lowest and highest non-zero dose from a given list of doses, - ignoring None values. If there are no valid doses, returns None for both - lowest and highest dose. - - Args: - dose_list (list[Optional[float]]): List of doses - - Returns: - tuple[Optional[float], Optional[float]]: Lowest dose and highest dose, - in that order. 
- """ - try: - # map dose list to whether there is recorded data (valid) - dose_validity_list = list(map(lambda d: d is not None, dose_list)) - # first valid dose - low_index = dose_validity_list[1:].index(True) + 1 - # last valid dose - high_index = len(dose_list) - 1 - dose_validity_list[1:][::-1].index(True) - return (dose_list[low_index], dose_list[high_index]) - except ValueError: - return (None, None) - - def _get_data_rows(self): - preferred_units = self.kwargs.get("preferred_units", None) - - rows = [] - for obj in self.queryset: - ser = obj.get_json(json_encode=False) - doses = self._get_doses_list(ser, preferred_units) - - # filter dose groups by those with recorded data - filtered_doses = list( - filter(lambda d: self._dose_is_reported(d["dose_group_id"], ser["groups"]), doses) + rob_df = pd.DataFrame( + data=[ + [rob_data[(endpoint_id, metric_id)] for metric_id in rob_headers.keys()] + for endpoint_id in endpoint_ids + ], + columns=list(rob_headers.values()), + index=endpoint_ids, ) - # special case - if no data was reported for any dose-group show all doses; - # it may be the case that data wasn't extracted - if len(filtered_doses) == 0: - filtered_doses = doses - - # build endpoint-group independent data - row = [ - ser["animal_group"]["experiment"]["study"]["id"], - ser["animal_group"]["experiment"]["study"]["short_citation"], - ser["animal_group"]["experiment"]["study"]["study_identifier"], - ser["animal_group"]["experiment"]["study"]["published"], - ser["animal_group"]["experiment"]["id"], - ser["animal_group"]["experiment"]["name"], - ser["animal_group"]["experiment"]["chemical"], - ser["animal_group"]["id"], - ser["animal_group"]["name"], - ser["animal_group"]["lifestage_exposed"], - ser["animal_group"]["lifestage_assessed"], - ser["animal_group"]["species"], - self._get_species_strain(ser), - ser["animal_group"]["generation"], - get_gen_species_strain_sex(ser, withN=False), - get_gen_species_strain_sex(ser, withN=True), - 
ser["animal_group"]["sex"], - ser["animal_group"]["dosing_regime"]["route_of_exposure"].lower(), - get_treatment_period( - ser["animal_group"]["experiment"], - ser["animal_group"]["dosing_regime"], - ), - ser["animal_group"]["dosing_regime"]["duration_exposure_text"], - ser["animal_group"]["dosing_regime"]["duration_exposure"], - ser["id"], - ser["name"], - ser["system"], - ser["organ"], - ser["effect"], - ser["effect_subtype"], - ser["diagnostic"], - self.get_flattened_tags(ser, "effects"), - self._get_observation_time_and_time_units(ser), - ser["observation_time_text"], - ser["data_type_label"], - self._get_doses_str(filtered_doses), - self._get_dose_units(doses), - ser["response_units"], - ser["expected_adversity_direction"], + df = df.join(rob_df, on="endpoint-id") + + df["route"] = df["dosing_regime-route_of_exposure_display"].str.lower() + df["species strain"] = ( + df["animal_group-species_name"] + " " + df["animal_group-strain_name"] + ) + + df["observation time"] = ( + df["endpoint-observation_time"].replace(np.nan, None).astype(str) + + " " + + df["endpoint-observation_time_units_display"] + ) + + df = self.handle_stdev(df) + df = self.handle_ci(df) + df = self.handle_incidence_summary(df) + df = self.handle_dose_groups(df) + df = self.handle_flat_doses(df) + df = self.handle_animal_description(df) + df = self.handle_treatment_period(df) + df = self.handle_bmd(df) + + df = df.drop_duplicates(subset="endpoint-id") + + df = df.rename( + columns={ + "study-id": "study id", + "study-short_citation": "study name", + "study-study_identifier": "study identifier", + "study-published": "study published", + "experiment-id": "experiment id", + "experiment-name": "experiment name", + "experiment-chemical": "chemical", + "animal_group-id": "animal group id", + "animal_group-name": "animal group name", + "animal_group-lifestage_exposed": "lifestage exposed", + "animal_group-lifestage_assessed": "lifestage assessed", + "animal_group-species_name": "species", + 
"animal_group-generation": "generation", + "animal_group-sex_display": "sex", + "dosing_regime-duration_exposure_text": "duration exposure", + "dosing_regime-duration_exposure": "duration exposure (days)", + "endpoint-id": "endpoint id", + "endpoint-name": "endpoint name", + "endpoint-system": "system", + "endpoint-organ": "organ", + "endpoint-effect": "effect", + "endpoint-effect_subtype": "effect subtype", + "endpoint-diagnostic": "diagnostic", + "endpoint-effects_display": "tags", + "endpoint-observation_time_text": "observation time text", + "endpoint-data_type_display": "data type", + "dose_group-dose_units_name": "dose units", + "endpoint-response_units": "response units", + "endpoint-expected_adversity_direction": "expected adversity direction", + "endpoint-trend_value": "trend test value", + "endpoint-trend_result_display": "trend test result", + } + ) + df = df.drop( + columns=[ + "endpoint_group-stdev", + "percent lower ci", + "percent affected", + "percent upper ci", + "dichotomous summary", + "endpoint-variance_type", + "dose_group-dose", + "endpoint-data_type", + "endpoint_group-NOEL", + "endpoint_group-incidence", + "endpoint_group-FEL", + "endpoint_group-treatment_effect_display", + "dose_group-dose_units_id", + "endpoint_group-LOEL", + "dose_group-dose_group_id", + "endpoint_group-variance", + "animal_group-sex_symbol", + "endpoint_group-upper_ci", + "endpoint_group-significance_level", + "endpoint_group-response", + "endpoint_group-lower_ci", + "endpoint_group-significant", + "dose_group-id", + "experiment-type_display", + "endpoint_group-dose_group_id", + "endpoint-observation_time_units_display", + "endpoint_group-n", + "dosing_regime-route_of_exposure_display", + "endpoint_group-id", + "animal_group-strain_name", + "endpoint-observation_time", ] + ) - # if groups exist, pull all available. Otherwise, start with an empty list. 
This - # is preferred than just pulling in edge cases where an endpoint has no data - # extracted but has more dose-groups at the animal group level than are avaiable - # for the entire data export. For example, an endpoint may have no data extracted - # and dose-groups, but the entire export may only have data with 4 dose-groups. - dose_list = ( - [ - self._get_dose(doses, i) if self._dose_is_reported(i, ser["groups"]) else None - for i in range(len(doses)) - ] - if ser["groups"] - else [] - ) - - # dose-group specific information - row.extend(self._dose_low_high(dose_list)) - try: - row.append(dose_list[ser["NOEL"]]) - except IndexError: - row.append(None) - try: - row.append(dose_list[ser["LOEL"]]) - except IndexError: - row.append(None) - try: - row.append(dose_list[ser["FEL"]]) - except IndexError: - row.append(None) - - dose_list.extend([None] * (self.num_doses - len(dose_list))) - - # bmd/bmdl information - row.extend(self._get_bmd_values(ser["bmds"], preferred_units)) + return df - row.extend([ser["trend_value"], ser["trend_result"]]) - row.extend(dose_list) +class EndpointSummaryExporter(Exporter): + def build_modules(self) -> list[ModelExport]: + return [ + StudyExport( + "study", + "animal_group__experiment__study", + include=("short_citation", "study_identifier"), + ), + ExperimentExport( + "experiment", "animal_group__experiment", include=("chemical", "type_display") + ), + AnimalGroupExport( + "animal_group", + "animal_group", + include=( + "name", + "species_name", + "strain_name", + "generation", + "sex_display", + "sex_symbol", + ), + ), + DosingRegimeExport( + "dosing_regime", + "animal_group__dosing_regime", + include=("route_of_exposure_display", "duration_exposure_text"), + ), + EndpointExport( + "endpoint", + "", + include=( + "id", + "url", + "name", + "system", + "organ", + "effect", + "observation_time_text", + "response_units", + "data_type", + ), + ), + EndpointGroupExport( + "endpoint_group", + "groups", + include=( + "id", + 
"dose_group_id", + "n", + "incidence", + "response", + "variance", + "significant", + ), + ), + DoseGroupExport( + "dose_group", + "animal_group__dosing_regime__doses", + include=("dose_units_name", "dose_group_id", "dose"), + ), + ] - sigs = get_significance_and_direction(ser["data_type"], ser["groups"]) - sigs.extend([None] * (self.num_doses - len(sigs))) - row.extend(sigs) - tres = [dose["treatment_effect"] for dose in ser["groups"]] - tres.extend([None] * (self.num_doses - len(tres))) - row.extend(tres) +class EndpointSummary(EndpointGroupFlatDataPivot): + def _set_responses(self, df: pd.DataFrame): + df["responses"] = None - row.extend( - [self.rob_data[(ser["id"], metric_id)] for metric_id in self.rob_headers.keys()] + def _func(group_df: pd.DataFrame) -> pd.Series: + unique_df = group_df.drop_duplicates(subset="endpoint_group-id") + response_series = ( + unique_df["endpoint_group-response"] + .map("{:g}".format, na_action="ignore") + .fillna("") + ) + incidence_series = ( + unique_df["endpoint_group-incidence"] + .map("{:g}".format, na_action="ignore") + .fillna("") + ) + variance_series = ( + unique_df["endpoint_group-variance"] + .map("{:g}".format, na_action="ignore") + .fillna("") + ) + response_or_incidence = incidence_series.mask( + response_series.str.len() > 0, response_series + ) + response_or_incidence_with_variance = response_or_incidence.mask( + (response_or_incidence.str.len() > 0) & (variance_series.str.len() > 0), + response_or_incidence + " ± " + variance_series, + ) + group_df["responses"] = [ + response_or_incidence_with_variance.reset_index(drop=True) + ] * group_df.shape[0] + + return group_df + + return ( + df.groupby("endpoint-id", group_keys=False, sort=False) + .apply(_func) + .reset_index(drop=True) + ) + + def _set_ns(self, df: pd.DataFrame): + df["ns"] = None + + def _func(group_df: pd.DataFrame) -> pd.Series: + unique_df = group_df.drop_duplicates(subset="endpoint_group-id") + group_df["ns"] = [ + unique_df["endpoint_group-n"] + 
.map("{:g}".format, na_action="ignore") + .fillna("") + .reset_index(drop=True) + ] * group_df.shape[0] + + return group_df + + return ( + df.groupby("endpoint-id", group_keys=False, sort=False) + .apply(_func) + .reset_index(drop=True) + ) + + def _set_response_direction(self, df: pd.DataFrame): + df["response_direction"] = None + + def _func(group_df: pd.DataFrame) -> pd.Series: + data_type = group_df["endpoint-data_type"].iloc[0] + control_group = group_df.iloc[0] + if pd.notna(control_group["endpoint_group-id"]) and pd.isna( + control_group["endpoint_group-response"] + ): + group_df["response_direction"] = "?" + return group_df + significant_groups = group_df[group_df["endpoint_group-significant"].fillna(False)] + if significant_groups.empty: + group_df["response_direction"] = "↔" + return group_df + significant_group = significant_groups.iloc[0] + if data_type in [constants.DataType.CONTINUOUS, constants.DataType.PERCENT_DIFFERENCE]: + if ( + significant_group["endpoint_group-response"] + > control_group["endpoint_group-response"] + ): + group_df["response_direction"] = "↑" + else: + group_df["response_direction"] = "↓" + return group_df + else: + group_df["response_direction"] = "↑" + return group_df + + return ( + df.groupby("endpoint-id", group_keys=False, sort=False) + .apply(_func) + .reset_index(drop=True) + ) + + def _set_doses(self, df: pd.DataFrame): + df["doses"] = None + + def _func(group_df: pd.DataFrame) -> pd.Series: + def __func(_group_df: pd.DataFrame) -> pd.Series: + _group_df["doses"] = [ + _group_df["dose_group-dose"] + .map("{:g}".format, na_action="ignore") + .fillna("") + .reset_index(drop=True) + ] * _group_df.shape[0] + return _group_df + + return ( + group_df.groupby("dose_group-dose_units_name", group_keys=False, sort=False) + .apply(__func) + .reset_index(drop=True) ) - rows.append(row) - - return rows - + return ( + df.groupby("endpoint-id", group_keys=False, sort=False) + .apply(_func) + .reset_index(drop=True) + ) + + def 
handle_other(self, df: pd.DataFrame) -> pd.DataFrame: + def _func(series: pd.Series): + units = series["dose_group-dose_units_name"] + doses, responses = series["doses"], series["responses"] + doses, responses = ( + doses.iloc[: min(doses.size, responses.size)], + responses.iloc[: min(doses.size, responses.size)], + ) + valid = responses.str.len() > 0 + return ", ".join(doses[valid] + " " + units + ": " + responses[valid]) + + df = self._set_responses(df) + df = self._set_ns(df) + df = self._set_response_direction(df) + df = self._set_doses(df) + + df["Dose units"] = df["dose_group-dose_units_name"] + df["Doses"] = df["doses"].str.join(", ") + df["N"] = df["ns"].str.join(", ") + df["Responses"] = df["responses"].str.join(", ") + df["Doses and responses"] = df.apply(_func, axis="columns", result_type="reduce") + df["Response direction"] = df["response_direction"] + + return df + + def handle_treatment_period(self, df: pd.DataFrame): + txt = df["experiment-type_display"].str.lower() + txt_index = txt.str.find("(") + txt_updated = ( + txt.to_frame(name="txt") + .join(txt_index.to_frame(name="txt_index")) + .apply( + lambda x: x["txt"] if x["txt_index"] < 0 else x["txt"][: x["txt_index"]], + axis="columns", + result_type="reduce", + ) + ).astype(str) + df["dosing_regime-duration_exposure_text"] = ( + txt_updated + " (" + df["dosing_regime-duration_exposure_text"] + ).where(df["dosing_regime-duration_exposure_text"].str.len() > 0) + ")" + return df + + def build_df(self) -> pd.DataFrame: + df = EndpointSummaryExporter().get_df( + self.queryset.select_related( + "animal_group__experiment__study", + "animal_group__dosing_regime", + ) + .prefetch_related("groups", "animal_group__dosing_regime__doses") + .order_by("id", "groups", "animal_group__dosing_regime__doses") + ) -class EndpointSummary(FlatFileExporter): - def _get_header_row(self): - return [ - "study-short_citation", - "study-study_identifier", - "experiment-chemical", - "animal_group-name", - "animal_group-sex", 
- "animal description (with n)", - "dosing_regime-route_of_exposure", - "dosing_regime-duration_exposure_text", - "species-name", - "strain-name", - "endpoint-id", - "endpoint-url", - "endpoint-system", - "endpoint-organ", - "endpoint-effect", - "endpoint-name", - "endpoint-observation_time", - "endpoint-response_units", - "Dose units", - "Doses", - "N", - "Responses", - "Doses and responses", - "Response direction", + df = df[ + (pd.isna(df["endpoint_group-id"])) + | (df["endpoint_group-dose_group_id"] == df["dose_group-dose_group_id"]) ] - - def _get_data_rows(self): - def getDoseUnits(doses): - return set(sorted([d["dose_units"]["name"] for d in doses])) - - def getDoses(doses, unit): - doses = [d["dose"] for d in doses if d["dose_units"]["name"] == unit] - return [f"{d:g}" for d in doses] - - def getNs(groups): - return [f"{grp['n'] if grp['n'] is not None else ''}" for grp in groups] - - def getResponses(groups): - resps = [] - for grp in groups: - txt = "" - if grp["isReported"]: - if grp["response"] is not None: - txt = f"{grp['response']:g}" - else: - txt = f"{grp['incidence']:g}" - if grp["variance"] is not None: - txt = f"{txt} ± {grp['variance']:g}" - resps.append(txt) - return resps - - def getDR(doses, responses, units): - txts = [] - for i in range(len(doses)): - if len(responses) > i and len(responses[i]) > 0: - txt = f"{doses[i]} {units}: {responses[i]}" - txts.append(txt) - return ", ".join(txts) - - def getResponseDirection(responses, data_type): - # return unknown if control response is null - if responses and responses[0]["response"] is None: - return "?" 
- - txt = "↔" - for resp in responses: - if resp["significant"]: - if data_type in ["C", "P"]: - if resp["response"] > responses[0]["response"]: - txt = "↑" - else: - txt = "↓" - else: - txt = "↑" - break - return txt - - rows = [] - for obj in self.queryset: - ser = obj.get_json(json_encode=False) - - doses = ser["animal_group"]["dosing_regime"]["doses"] - units = getDoseUnits(doses) - - # build endpoint-group independent data - row = [ - ser["animal_group"]["experiment"]["study"]["short_citation"], - ser["animal_group"]["experiment"]["study"]["study_identifier"], - ser["animal_group"]["experiment"]["chemical"], - ser["animal_group"]["name"], - ser["animal_group"]["sex"], - get_gen_species_strain_sex(ser, withN=True), - ser["animal_group"]["dosing_regime"]["route_of_exposure"], - get_treatment_period( - ser["animal_group"]["experiment"], - ser["animal_group"]["dosing_regime"], - ), - ser["animal_group"]["species"], - ser["animal_group"]["strain"], - ser["id"], - ser["url"], - ser["system"], - ser["organ"], - ser["effect"], - ser["name"], - ser["observation_time_text"], - ser["response_units"], + if df.empty: + return df + df = self.handle_animal_description(df) + df = self.handle_treatment_period(df) + df = self.handle_other(df) + + df = df.drop_duplicates(subset=["endpoint-id", "dose_group-dose_units_name"]) + df = df.sort_values(by=["endpoint-id", "dose_group-dose_units_name"]) + + df = df.rename( + columns={ + "animal description (with N)": "animal description (with n)", + "animal_group-sex_display": "animal_group-sex", + "dosing_regime-route_of_exposure_display": "dosing_regime-route_of_exposure", + "animal_group-species_name": "species-name", + "animal_group-strain_name": "strain-name", + "endpoint-observation_time_text": "endpoint-observation_time", + } + ) + + df = df.drop( + columns=[ + "doses", + "experiment-type_display", + "animal description", + "endpoint_group-incidence", + "animal_group-sex_symbol", + "dose_group-dose_units_name", + 
"dose_group-dose_group_id", + "endpoint-data_type", + "response_direction", + "responses", + "endpoint_group-significant", + "endpoint_group-n", + "dose_group-dose", + "endpoint_group-response", + "endpoint_group-id", + "animal_group-generation", + "endpoint_group-dose_group_id", + "ns", + "endpoint_group-variance", ] + ) - responses_list = getResponses(ser["groups"]) - ns_list = getNs(ser["groups"]) - response_direction = getResponseDirection(ser["groups"], ser["data_type"]) - for unit in units: - row_copy = copy(row) - doses_list = getDoses(doses, unit) - row_copy.extend( - [ - unit, # 'units' - ", ".join(doses_list), # Doses - ", ".join(ns_list), # Ns - ", ".join(responses_list), # Responses w/ units - getDR(doses_list, responses_list, unit), - response_direction, - ] - ) - rows.append(row_copy) - - return rows + return df diff --git a/hawc/apps/animal/models.py b/hawc/apps/animal/models.py index 91fef2ac4a..bca8b541f4 100644 --- a/hawc/apps/animal/models.py +++ b/hawc/apps/animal/models.py @@ -17,7 +17,6 @@ from ..common.helper import ( HAWCDjangoJSONEncoder, SerializerHelper, - cleanHTML, df_move_column, tryParseInt, ) @@ -138,46 +137,6 @@ def is_generational(self): def get_assessment(self): return self.study.get_assessment() - @staticmethod - def flat_complete_header_row(): - return ( - "experiment-id", - "experiment-url", - "experiment-name", - "experiment-type", - "experiment-has_multiple_generations", - "experiment-chemical", - "experiment-cas", - "experiment-dtxsid", - "experiment-chemical_source", - "experiment-purity_available", - "experiment-purity_qualifier", - "experiment-purity", - "experiment-vehicle", - "experiment-guideline_compliance", - "experiment-description", - ) - - @staticmethod - def flat_complete_data_row(ser): - return ( - ser["id"], - ser["url"], - ser["name"], - ser["type"], - ser["has_multiple_generations"], - ser["chemical"], - ser["cas"], - ser["dtxsid"], - ser["chemical_source"], - ser["purity_available"], - 
ser["purity_qualifier"], - ser["purity"], - ser["vehicle"], - ser["guideline_compliance"], - cleanHTML(ser["description"]), - ) - @classmethod def delete_caches(cls, ids): Endpoint.delete_caches( @@ -313,48 +272,10 @@ def generation_short(self): def get_generation_short(cls, value) -> str: return "Other" if value == "Ot" else value - @staticmethod - def flat_complete_header_row(): - return ( - "animal_group-id", - "animal_group-url", - "animal_group-name", - "animal_group-sex", - "animal_group-animal_source", - "animal_group-lifestage_exposed", - "animal_group-lifestage_assessed", - "animal_group-siblings", - "animal_group-parents", - "animal_group-generation", - "animal_group-comments", - "animal_group-diet", - "species-name", - "strain-name", - ) - @classmethod def get_relation_id(cls, rel): return str(rel["id"]) if rel else None - @classmethod - def flat_complete_data_row(cls, ser): - return ( - ser["id"], - ser["url"], - ser["name"], - ser["sex"], - ser["animal_source"], - ser["lifestage_exposed"], - ser["lifestage_assessed"], - cls.get_relation_id(ser["siblings"]), - "|".join([cls.get_relation_id(p) for p in ser["parents"]]), - ser["generation"], - cleanHTML(ser["comments"]), - ser["diet"], - ser["species"], - ser["strain"], - ) - @classmethod def delete_caches(cls, ids): Endpoint.delete_caches( @@ -471,40 +392,6 @@ def dose_groups(self): def isAnimalsDosed(self, animal_group): return self.dosed_animals == animal_group - @staticmethod - def flat_complete_header_row(): - return ( - "dosing_regime-id", - "dosing_regime-dosed_animals", - "dosing_regime-route_of_exposure", - "dosing_regime-duration_exposure", - "dosing_regime-duration_exposure_text", - "dosing_regime-duration_observation", - "dosing_regime-num_dose_groups", - "dosing_regime-positive_control", - "dosing_regime-negative_control", - "dosing_regime-description", - ) - - @staticmethod - def flat_complete_data_row(ser): - return ( - ( - ser["id"], - AnimalGroup.get_relation_id(ser["dosed_animals"]), - 
ser["route_of_exposure"], - ser["duration_exposure"], - ser["duration_exposure_text"], - ser["duration_observation"], - ser["num_dose_groups"], - ser["positive_control"], - ser["negative_control"], - cleanHTML(ser["description"]), - ) - if ser - else (None for _ in range(10)) - ) - def can_delete(self) -> bool: # can delete only if no animals others than those dosed are related return self.animalgroup_set.exclude(id=self.dosed_animals_id).count() == 0 @@ -542,21 +429,6 @@ class Meta: def __str__(self): return f"{self.dose} {self.dose_units}" - @staticmethod - def flat_complete_data_row(ser_full, units, idx): - cols = [] - ser = [v for v in ser_full if v["dose_group_id"] == idx] - for unit in units: - v = None - for s in ser: - if s["dose_units"]["name"] == unit: - v = s["dose"] - break - - cols.append(v) - - return cols - class Endpoint(BaseEndpoint): objects = managers.EndpointManager() @@ -1040,88 +912,6 @@ def dataset_increasing(self): change += resps[i] - resps[0] return change >= 0 - @staticmethod - def flat_complete_header_row(): - return ( - "endpoint-id", - "endpoint-url", - "endpoint-name", - "endpoint-effects", - "endpoint-system", - "endpoint-organ", - "endpoint-effect", - "endpoint-effect_subtype", - "endpoint-name_term_id", - "endpoint-system_term_id", - "endpoint-organ_term_id", - "endpoint-effect_term_id", - "endpoint-effect_subtype_term_id", - "endpoint-litter_effects", - "endpoint-litter_effect_notes", - "endpoint-observation_time", - "endpoint-observation_time_units", - "endpoint-observation_time_text", - "endpoint-data_location", - "endpoint-response_units", - "endpoint-data_type", - "endpoint-variance_type", - "endpoint-confidence_interval", - "endpoint-data_reported", - "endpoint-data_extracted", - "endpoint-values_estimated", - "endpoint-expected_adversity_direction", - "endpoint-monotonicity", - "endpoint-statistical_test", - "endpoint-trend_value", - "endpoint-trend_result", - "endpoint-diagnostic", - "endpoint-power_notes", - 
"endpoint-results_notes", - "endpoint-endpoint_notes", - "endpoint-additional_fields", - ) - - @staticmethod - def flat_complete_data_row(ser): - return ( - ser["id"], - ser["url"], - ser["name"], - "|".join([d["name"] for d in ser["effects"]]), - ser["system"], - ser["organ"], - ser["effect"], - ser["effect_subtype"], - ser["name_term"], - ser["system_term"], - ser["organ_term"], - ser["effect_term"], - ser["effect_subtype_term"], - ser["litter_effects"], - ser["litter_effect_notes"], - ser["observation_time"], - ser["observation_time_units"], - ser["observation_time_text"], - ser["data_location"], - ser["response_units"], - ser["data_type"], - ser["variance_name"], - ser["confidence_interval"], - ser["data_reported"], - ser["data_extracted"], - ser["values_estimated"], - ser["expected_adversity_direction_text"], - ser["monotonicity"], - ser["statistical_test"], - ser["trend_value"], - ser["trend_result"], - ser["diagnostic"], - ser["power_notes"], - cleanHTML(ser["results_notes"]), - cleanHTML(ser["endpoint_notes"]), - json.dumps(ser["additional_fields"]), - ) - @staticmethod def setMaximumPercentControlChange(ep): """ @@ -1391,44 +1181,6 @@ def getNRangeText(ns): else: return f"{nmin}-{nmax}" - @staticmethod - def flat_complete_header_row(): - return ( - "endpoint_group-id", - "endpoint_group-dose_group_id", - "endpoint_group-n", - "endpoint_group-incidence", - "endpoint_group-response", - "endpoint_group-variance", - "endpoint_group-lower_ci", - "endpoint_group-upper_ci", - "endpoint_group-significant", - "endpoint_group-significance_level", - "endpoint_group-treatment_effect", - "endpoint_group-NOEL", - "endpoint_group-LOEL", - "endpoint_group-FEL", - ) - - @staticmethod - def flat_complete_data_row(ser, endpoint): - return ( - ser["id"], - ser["dose_group_id"], - ser["n"], - ser["incidence"], - ser["response"], - ser["variance"], - ser["lower_ci"], - ser["upper_ci"], - ser["significant"], - ser["significance_level"], - ser["treatment_effect"], - 
ser["dose_group_id"] == endpoint["NOEL"], - ser["dose_group_id"] == endpoint["LOEL"], - ser["dose_group_id"] == endpoint["FEL"], - ) - reversion.register(Experiment) reversion.register(AnimalGroup) diff --git a/hawc/apps/common/exports.py b/hawc/apps/common/exports.py index 94205d5774..0ae15eda61 100644 --- a/hawc/apps/common/exports.py +++ b/hawc/apps/common/exports.py @@ -151,8 +151,8 @@ def format_time(self, df: pd.DataFrame) -> pd.DataFrame: return df tz = timezone.get_default_timezone() for key in [self.get_column_name("created"), self.get_column_name("last_updated")]: - if key in df.columns: - df.loc[:, key] = df[key].dt.tz_convert(tz).dt.strftime("%Y-%m-%dT%H:%M:%S.%f%z") + if key in df.columns and not df[key].isnull().all(): + df[key] = df[key].dt.tz_convert(tz).dt.strftime("%Y-%m-%dT%H:%M:%S.%f%z") return df def get_df(self, qs: QuerySet) -> pd.DataFrame: diff --git a/hawc/apps/common/models.py b/hawc/apps/common/models.py index 6d2e281efc..891ae364e3 100644 --- a/hawc/apps/common/models.py +++ b/hawc/apps/common/models.py @@ -518,19 +518,21 @@ def include_related( return queryset | queryset.model.objects.filter(filters) -def sql_display(name: str, Choice: type[Choices]) -> Case: - """Create a annotation to return the display name via SQL +def sql_display(name: str, choice: type[Choices] | dict, default="?") -> Case: + """Create an annotation to return the display name via SQL Args: name (str): the field name - Choice (type[Choices]): a choice field + choice (type[Choices]): a choice field or dict of choices + default: default value if display value is not found Returns: Case: the case statement for use in an annotation """ + choices = choice.items() if isinstance(choice, dict) else choice.choices return Case( - *(When(**{name: key, "then": Value(value)}) for key, value in Choice.choices), - default=Value("?"), + *(When(**{name: key, "then": Value(value)}) for key, value in choices), + default=Value(default), ) diff --git a/hawc/apps/study/models.py 
b/hawc/apps/study/models.py index 83d303bb73..b0462ee33f 100644 --- a/hawc/apps/study/models.py +++ b/hawc/apps/study/models.py @@ -10,7 +10,7 @@ from reversion import revisions as reversion from ..assessment.models import Communication -from ..common.helper import SerializerHelper, cleanHTML +from ..common.helper import SerializerHelper from ..lit.models import Reference from . import constants, managers @@ -197,65 +197,6 @@ def get_study_types(self) -> list[str]: types.append(field) return types - @staticmethod - def flat_complete_header_row(): - return ( - "study-id", - "study-hero_id", - "study-pubmed_id", - "study-doi", - "study-url", - "study-short_citation", - "study-full_citation", - "study-coi_reported", - "study-coi_details", - "study-funding_source", - "study-bioassay", - "study-epi", - "study-epi_meta", - "study-in_vitro", - "study-eco", - "study-study_identifier", - "study-contact_author", - "study-ask_author", - "study-summary", - "study-editable", - "study-published", - ) - - @staticmethod - def flat_complete_data_row(ser, identifiers_df: pd.DataFrame | None = None) -> tuple: - try: - ident_row = ( - identifiers_df.loc[ser["id"]] if isinstance(identifiers_df, pd.DataFrame) else None - ) - except KeyError: - ident_row = None - return ( - ser["id"], - # IDs can come from identifiers data frame if exists, else check study serializer - ident_row.hero_id if ident_row is not None else ser.get("hero_id", None), - ident_row.pubmed_id if ident_row is not None else ser.get("pubmed_id", None), - ident_row.doi if ident_row is not None else ser.get("doi", None), - ser["url"], - ser["short_citation"], - ser["full_citation"], - ser["coi_reported"], - ser["coi_details"], - ser["funding_source"], - ser["bioassay"], - ser["epi"], - ser["epi_meta"], - ser["in_vitro"], - ser["eco"], - ser["study_identifier"], - ser["contact_author"], - ser["ask_author"], - cleanHTML(ser["summary"]), - ser["editable"], - ser["published"], - ) - @classmethod def identifiers_df(cls, qs: 
models.QuerySet, relation: str) -> pd.DataFrame: """Returns a data frame with reference identifiers for each study in the QuerySet diff --git a/tests/data/api/api-dp-data-animal-bioassay-endpoint.json b/tests/data/api/api-dp-data-animal-bioassay-endpoint.json index 21bd0caf45..e0a9949615 100644 --- a/tests/data/api/api-dp-data-animal-bioassay-endpoint.json +++ b/tests/data/api/api-dp-data-animal-bioassay-endpoint.json @@ -58,7 +58,7 @@ "study name": "Biesemeier JA et al. 2011", "study published": true, "system": "developmental", - "tags": "|tag1|", + "tags": "tag1", "treatment period": "developmental (3 days)", "trend test result": "significant", "trend test value": 0.04 @@ -122,7 +122,7 @@ "study name": "Biesemeier JA et al. 2011", "study published": true, "system": "", - "tags": "||", + "tags": "", "treatment period": "developmental (3 days)", "trend test result": "not reported", "trend test value": null @@ -186,7 +186,7 @@ "study name": "Biesemeier JA et al. 2011", "study published": true, "system": "Cardiovascular", - "tags": "||", + "tags": "", "treatment period": "developmental (3 days)", "trend test result": "not reported", "trend test value": null @@ -250,7 +250,7 @@ "study name": "Biesemeier JA et al. 2011", "study published": true, "system": "", - "tags": "||", + "tags": "", "treatment period": "developmental (3 days)", "trend test result": "not reported", "trend test value": null @@ -314,7 +314,7 @@ "study name": "Biesemeier JA et al. 
2011", "study published": true, "system": "", - "tags": "||", + "tags": "", "treatment period": "developmental (3 days)", "trend test result": "not reported", "trend test value": null diff --git a/tests/hawc/apps/animal/test_exports.py b/tests/hawc/apps/animal/test_exports.py deleted file mode 100644 index 4b1f617bbe..0000000000 --- a/tests/hawc/apps/animal/test_exports.py +++ /dev/null @@ -1,107 +0,0 @@ -from hawc.apps.animal.constants import DataType -from hawc.apps.animal.exports import EndpointFlatDataPivot, get_significance_and_direction - - -def test_get_significance_and_direction(): - # no data - resp = get_significance_and_direction( - DataType.CONTINUOUS, - [], - ) - assert resp == [] - - # continuous - resp = get_significance_and_direction( - DataType.CONTINUOUS, - [ - dict(significant=False, response=0), - dict(significant=False, response=1), - dict(significant=True, response=0), - dict(significant=True, response=-1), - dict(significant=True, response=1), - ], - ) - assert resp == ["No", "No", "Yes - ?", "Yes - ↓", "Yes - ↑"] - - # dichotomous - resp = get_significance_and_direction( - DataType.DICHOTOMOUS, - [ - dict(percent_affected=0, significant=False), - dict(percent_affected=10, significant=False), - dict(percent_affected=20, significant=True), - ], - ) - assert resp == ["No", "No", "Yes - ↑"] - - resp = get_significance_and_direction( - DataType.DICHOTOMOUS_CANCER, - [ - dict(percent_affected=50, significant=False), - dict(percent_affected=40, significant=False), - dict(percent_affected=30, significant=True), - ], - ) - assert resp == ["No", "No", "Yes - ↓"] - - # percent diff - resp = get_significance_and_direction( - DataType.CONTINUOUS, - [ - dict(significant=False, response=0), - dict(significant=False, response=0), - dict(significant=True, response=0), - dict(significant=True, response=-1), - dict(significant=True, response=1), - ], - ) - assert resp == ["No", "No", "Yes - ?", "Yes - ↓", "Yes - ↑"] - - -class TestEndpointFlatDataPivot: - def 
test_dose_low_high(self): - # returns a tuple of the lowest non-zero dose - # and the highest dose - func = EndpointFlatDataPivot._dose_low_high - - # all of these doses are present - valid_doses = [0.0, 1.0, 20.0, 300.0] - (low, high) = func(valid_doses) - assert low == 1.0 and high == 300.0 - - # if a dose is not present, it will be None - one_invalid_dose = [0.0, 1.0, None, 300.0] - (low, high) = func(one_invalid_dose) - assert low == 1.0 and high == 300.0 - - # missing doses can affect lowest dose - invalid_low_dose = [0.0, None, 20.0, 300.0] - (low, high) = func(invalid_low_dose) - assert low == 20.0 and high == 300.0 - - # missing doses can affect highest dose - invalid_high_dose = [0.0, 1.0, 20.0, None] - (low, high) = func(invalid_high_dose) - assert low == 1.0 and high == 20.0 - - # if only one valid dose, it will be both lowest and highest - one_valid_dose = [0.0, None, 20.0, None] - (low, high) = func(one_valid_dose) - assert low == 20.0 and high == 20.0 - - # if no valid dose, lowest and highest is None - invalid_doses = [0.0, None, None, None] - (low, high) = func(invalid_doses) - assert low is None and high is None - - def test_dose_is_reported(self): - func = EndpointFlatDataPivot._dose_is_reported - - # check that dose is reported even when value is falsy but not None - assert func(1, [dict(dose_group_id=1, n=0)]) is True - assert func(1, [dict(dose_group_id=1, response=0)]) is True - assert func(1, [dict(dose_group_id=1, incidence=0)]) is True - - assert func(1, []) is False - assert func(1, [dict(dose_group_id=1)]) is False - assert func(1, [dict(dose_group_id=1, n=None, response=None, incidence=None)]) is False