Skip to content

Commit

Permalink
Basic check on lineage notes (2-column TSV with unique keys)
Browse files Browse the repository at this point in the history
  • Loading branch information
corneliusroemer committed Sep 24, 2023
1 parent a452e5e commit 4127017
Show file tree
Hide file tree
Showing 2 changed files with 47 additions and 0 deletions.
5 changes: 5 additions & 0 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11,4 +11,9 @@ repos:
entry: python tests/test_duplicate_designations.py
language: python
files: lineages.csv
# Hook that runs tests/test_duplicate_lineage_in_notes.py to check lineage_notes.txt
# for repeated first-column entries.
# NOTE(review): pre-commit interprets `files` as a regular expression, so the
# unescaped `.` matches any character and the pattern is unanchored - confirm
# whether `^lineage_notes\.txt$` is what is intended.
- id: duplicate-lineage-in-notes
name: Test lineage_notes.txt for duplicate first column entries
entry: python tests/test_duplicate_lineage_in_notes.py
language: python
files: lineage_notes.txt

42 changes: 42 additions & 0 deletions tests/test_duplicate_lineage_in_notes.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
#!/usr/bin/env python3


def check_for_duplicates(filename, buffer_size=1024 * 1024):  # 1MB buffer
    """Validate a tab-separated notes file with unique first-column keys.

    Every line is split on tabs. A line is reported (via ``print``) when it
    has more than two fields or when its first field has already appeared on
    an earlier line. All problems are reported; the scan does not stop at the
    first one.

    The final line is checked even when the file does not end with a newline.

    Args:
        filename: Path of the file to check.
        buffer_size: Number of characters requested per ``read`` call.

    Returns:
        ``True`` if no problem was reported, ``False`` otherwise.
    """
    seen = set()
    is_valid = True

    with open(filename, "r") as f:
        # Line numbers are 1-based to match what an editor shows.
        for line_number, line in enumerate(_iter_lines(f, buffer_size), start=1):
            fields = line.split("\t")

            if len(fields) > 2:
                print(f"lineage_notes.txt: Line {line_number} has more than 2 fields `{fields}`")
                is_valid = False

            key = fields[0]
            if key in seen:
                print(f"Duplicate {key} found in line {line_number}")
                is_valid = False
            seen.add(key)

    return is_valid


def _iter_lines(handle, buffer_size):
    """Yield newline-stripped lines from *handle*, reading *buffer_size* characters at a time."""
    pending = ""
    while True:
        chunk = handle.read(buffer_size)
        if not chunk:
            break
        # The last piece may be an incomplete line; carry it into the next chunk.
        *complete, pending = (pending + chunk).split("\n")
        yield from complete
    # A non-empty remainder means the file did not end with a newline; that
    # final line still has to be validated.
    if pending:
        yield pending


# Usage: run as a script (e.g. `python tests/test_duplicate_lineage_in_notes.py`).
# The guard keeps a plain import of this module from terminating the importing
# process.
if __name__ == "__main__":
    import sys  # sys.exit is always available; the bare `exit` helper comes from `site`

    # Exit with status 1 if any problem was reported, status 0 otherwise.
    sys.exit(0 if check_for_duplicates("lineage_notes.txt") else 1)

0 comments on commit 4127017

Please sign in to comment.