|
1 | 1 | from pysnptools.snpreader import Pheno
|
2 | 2 | import numpy as np
|
| 3 | +import pandas as pd |
3 | 4 | from snipar.gtarray import gtarray
|
4 | 5 | from snipar.utilities import make_id_dict
|
5 |
| - |
6 |
def read_phenotype(file_path, column=None, column_index=None, na_values='NA'):
    """
    Read one phenotype column from a delimited text file with a header row.

    The header is expected to have one of these layouts:
        - First two columns are 'FID' and 'IID'
        - First column is 'IID'
    The file is treated as tab-delimited if the header line contains a tab,
    and as whitespace-delimited (any run of spaces) otherwise.

    Parameters:
        file_path (str): Path to the text file
        column (str, optional): Name of column to extract (other than 'FID' or 'IID').
            Takes precedence over column_index when both are given.
        column_index (int, optional): Index of column to extract (counting from 1 after 'IID'/'FID')
            Note: This is 1-based indexing
        na_values (str or list, optional): String or list of strings to recognize as NA/NaN. Default is 'NA'.

    Returns:
        y : :class:`snipar.gtarray`
            vector of non-missing phenotype values from specified column of phenofile along with individual IDs

    Raises:
        ValueError: if the file has no header line, the header has no 'IID' column,
            the requested column is 'FID'/'IID' or absent from the header,
            column_index is out of range, or the phenotype column holds
            values that are neither numeric nor recognised as missing.

    Note: If neither column nor column_index is provided, defaults to first column after IID/FID
    """
    # Only the header line is needed to work out the delimiter and layout.
    with open(file_path, 'r') as file:
        first_line = file.readline()
    if not first_line.strip():
        raise ValueError(f"Phenotype file {file_path} is empty or has no header line")

    if '\t' in first_line:
        delimiter = '\t'
        # Strip only the line ending so names match what pandas will see.
        header = first_line.rstrip('\r\n').split('\t')
    else:
        # Any run of whitespace separates fields, so files aligned with
        # several spaces are parsed the same way as single-space files.
        delimiter = r'\s+'
        header = first_line.split()

    if 'IID' not in header:
        raise ValueError("Phenotype file header must contain an IID column")
    has_fid = len(header) > 1 and header[0] == 'FID' and header[1] == 'IID'

    if column is None:
        if column_index is None:
            # Default to first column after IID/FID
            column_index = 1
        if column_index < 1:
            # Zero/negative values would wrap around or select an ID column.
            raise ValueError(f"Column index {column_index} out of range")
        offset = 2 if has_fid else 1
        adjusted_index = column_index + offset - 1  # -1 for 0-based indexing
        if adjusted_index >= len(header):
            raise ValueError(f"Column index {column_index} out of range")
        column = header[adjusted_index]

    if column in ('FID', 'IID'):
        raise ValueError("Phenotype cannot be named FID or IID")
    if column not in header:
        raise ValueError(f"Column {column} not found in phenotype file header")
    print('Reading phenotype from column:', column)

    # IID is forced to str so numeric-looking IDs keep leading zeros and are
    # never rendered as floats (e.g. '12.0') when the column contains NAs.
    df = pd.read_csv(file_path,
                     sep=delimiter,
                     usecols=['IID', column],
                     na_values=na_values,
                     dtype={'IID': str})

    # Values matching na_values are already NaN at this point; anything else
    # that is not a number is a malformed entry and is reported, not dropped.
    try:
        df[column] = pd.to_numeric(df[column])
    except (ValueError, TypeError) as err:
        raise ValueError("Phenotype contains non-numeric values that cannot be converted") from err

    # Remove rows with missing values in either IID or target column
    df = df.dropna(subset=['IID', column])

    # Always hand back a float (n, 1) array, whether or not the column
    # happened to parse as integers.
    y = np.asarray(df[column].values, dtype=float).reshape((df.shape[0], 1))
    ids = np.array(df['IID'].values, dtype=str)
    return gtarray(y, ids=ids)
34 | 66 |
|
35 | 67 | def match_phenotype(G,y,pheno_ids):
|
36 | 68 | """Match a phenotype to a genotype array by individual IDs.
|
|
0 commit comments