Skip to content

Commit 2459171

Browse files
committed
feat: standardize fasta, multiple column primary key, fix
`polars_write_database` for list[number] types
1 parent 832daa8 commit 2459171

File tree

4 files changed

+84
-9
lines changed

4 files changed

+84
-9
lines changed

README.md

+7-2
Original file line numberDiff line numberDiff line change
@@ -94,18 +94,23 @@ This package also provides some useful functions to work with PostgreSQL, SMILES
9494
from bio_data_to_db.utils.postgresql import (
9595
create_db_if_not_exists,
9696
create_schema_if_not_exists,
97-
set_column_as_primary_key,
97+
make_int_column_primary_key_identity,
98+
make_columns_primary_key,
9899
make_columns_unique,
99100
make_large_columns_unique,
100101
split_column_str_to_list,
101-
polars_write_database,
102+
polars_write_database, # addressed issues with list columns
102103
)
103104

104105
from bio_data_to_db.utils.smiles import (
105106
canonical_smiles_wo_salt,
106107
polars_canonical_smiles_wo_salt,
107108
)
108109

110+
from bio_data_to_db.utils.fasta import (
111+
polars_standardize_fasta,
112+
)
113+
109114
from bio_data_to_db.utils.polars import (
110115
w_pbar,
111116
)

rust/Cargo.lock

+1-1
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

src/bio_data_to_db/utils/fasta.py

+18
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
import polars as pl
2+
3+
4+
def polars_standardize_fasta(
    df: pl.DataFrame, fasta_col: str = "fasta", out_col: str = "fasta"
) -> pl.DataFrame:
    """
    Standardize a FASTA sequence column: uppercase it and strip all whitespace.

    Args:
        df: Input DataFrame.
        fasta_col: Name of the column holding the raw FASTA sequences.
        out_col: Name of the standardized output column. Defaults to the
            input column name, i.e. the column is overwritten in place.

    Returns:
        A new DataFrame with the standardized column.
    """
    df = df.with_columns(
        pl.col(fasta_col)
        .str.to_uppercase()
        # NOTE: `\s` is a regex pattern, so a single pass removes spaces,
        # newlines, carriage returns, and tabs. The previous version removed
        # only "\n" and " ", leaving stray "\r" behind for sequences that
        # came from files with Windows (CRLF) line endings.
        .str.replace_all(r"\s", "")
        .alias(out_col)
    )

    return df

src/bio_data_to_db/utils/postgresql.py

+58-6
Original file line numberDiff line numberDiff line change
@@ -162,7 +162,7 @@ def create_schema_if_not_exists(uri: str, schema_name: str, comment: str | None
162162
logger.exception(f"Error creating schema '{schema_name}' in DB '{db_name}'")
163163

164164

165-
def set_column_as_primary_key(
165+
def make_int_column_primary_key_identity(
166166
uri: str,
167167
*,
168168
schema_name: str = "public",
@@ -173,6 +173,7 @@ def set_column_as_primary_key(
173173
Make an existing index column (integer type) as primary key with auto increment (identity).
174174
175175
This is used because pl.DataFrame.write_database() doesn't support writing index column as primary key.
176+
Also, it will automatically set the start value of auto increment to the max value in the column.
176177
177178
Example:
178179
>>> df = pl.DataFrame({"smiles": ["CCO", "CCN", "CCC"]}) # doctest: +SKIP
@@ -227,6 +228,46 @@ def set_column_as_primary_key(
227228
)
228229

229230

231+
def make_columns_primary_key(
    uri: str,
    *,
    schema_name: str = "public",
    table_name: str,
    column_names: str | Sequence[str],
):
    """
    Add a (possibly composite) PRIMARY KEY constraint on existing columns.

    Unlike make_int_column_primary_key_identity(), no auto-increment
    (identity) is attached; the given columns are only promoted to the
    table's primary key. Comparable to make_columns_unique(), but with a
    primary-key constraint instead of a unique constraint.
    """
    # Normalize a single column name into a one-element list up front.
    cols = [column_names] if isinstance(column_names, str) else column_names

    with psycopg.connect(
        conninfo=uri,
    ) as conn:
        try:
            cursor = conn.cursor()

            # Identifiers are composed with psycopg's sql module, so schema,
            # table, and column names are all safely quoted.
            query = sql.SQL("""
                ALTER TABLE {table}
                ADD PRIMARY KEY ({columns});
            """).format(
                table=sql.Identifier(schema_name, table_name),
                columns=sql.SQL(",").join(sql.Identifier(col) for col in cols),
            )
            cursor.execute(query)
            conn.commit()

        except psycopg.Error:
            logger.exception(
                f"Error setting primary key for column '{column_names}' in table '{table_name}'"
            )
269+
270+
230271
def make_columns_unique(
231272
uri: str,
232273
*,
@@ -250,9 +291,9 @@ def make_columns_unique(
250291

251292
cursor.execute(
252293
query=sql.SQL("""
253-
ALTER TABLE {table}
254-
ADD CONSTRAINT {table_unique_constraint}
255-
UNIQUE ({columns});
294+
ALTER TABLE {table}
295+
ADD CONSTRAINT {table_unique_constraint}
296+
UNIQUE ({columns});
256297
""").format(
257298
table=sql.Identifier(schema_name, table_name),
258299
table_unique_constraint=sql.Identifier(
@@ -267,7 +308,7 @@ def make_columns_unique(
267308

268309
except psycopg.Error:
269310
logger.exception(
270-
f"Error setting primary key for column '{column_names}' in table '{table_name}'"
311+
f"Error setting unique constraint for column '{column_names}' in table '{table_name}'"
271312
)
272313

273314

@@ -479,7 +520,18 @@ def polars_write_database(
479520
for col, dtype in columns_dtype.items()
480521
}
481522

482-
df.to_pandas(use_pyarrow_extension_array=True).to_sql(
523+
pd_df = df.to_pandas(use_pyarrow_extension_array=True)
524+
525+
# If any column has type list[number] in Polars, the pandas DataFrame will have a numpy array.
526+
# We need to convert it to a list, because `to_sql` doesn't support numpy arrays.
527+
for col, dtype in columns_dtype.items():
528+
if isinstance(dtype, pl.List):
529+
if isinstance(dtype.inner, pl.Utf8):
530+
continue
531+
pd_df[col] = pd_df[col].apply(lambda x: x.tolist())
532+
533+
# ic(pd_df)
534+
pd_df.to_sql(
483535
schema=schema_name,
484536
name=table_name,
485537
con=connection,

0 commit comments

Comments
 (0)