-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathpart-3.py
135 lines (104 loc) · 3.3 KB
/
part-3.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
import json
from pathlib import Path
import pandas as pd
def load_clean_json(file_path: Path, columns_to_keep: list) -> pd.DataFrame:
    """
    Load a JSON file, flatten it into a DataFrame and keep selected columns.

    The parsed JSON is passed through ``pd.json_normalize``, so nested
    objects are flattened into columns such as ``published.date-parts``.

    Parameters
    ----------
    file_path : Path
        Path to the JSON file.
    columns_to_keep : list
        Names of the columns to keep in the DataFrame. Names that do not
        exist in the normalized data are skipped by ``DataFrame.filter``
        rather than raising.

    Returns
    -------
    pd.DataFrame
        The normalized data restricted to ``columns_to_keep``.
    """
    # JSON text is UTF-8; pass the encoding explicitly so decoding does not
    # depend on the platform's locale default.
    with file_path.open("r", encoding="utf-8") as json_file:
        json_data = json.load(json_file)

    normalized_data = pd.json_normalize(json_data)
    return normalized_data.filter(items=columns_to_keep)
def format_date(date_parts: list) -> str:
    """
    Build a ``year-month-day`` string from a list of date parts.

    Parameters
    ----------
    date_parts : list
        Sequence whose first three items are the year, month, and day.

    Returns
    -------
    str
        The year as given, followed by the month and day each
        zero-padded to two digits, joined with hyphens.
    """
    # Each piece is looked up and formatted in turn, left to right.
    year_text = f"{date_parts[0]}"
    month_text = f"{date_parts[1]:02d}"
    day_text = f"{date_parts[2]:02d}"
    return "-".join((year_text, month_text, day_text))
def clean_title(value):
    """Return the part of a title that comes before the first ':'.

    Parameters
    ----------
    value : list or str
        Either a title string, or a list whose first element is the title.

    Returns
    -------
    str
        The text before the first ':' with surrounding whitespace removed.
        If the title is not a string (e.g., an int or a missing value) it is
        returned unchanged. An empty list yields ``None``.
    """
    if isinstance(value, str):
        # Strings are subscriptable too: indexing one would silently reduce
        # the title to its first character, so use the string as-is.
        title = value
    else:
        try:
            # Extract the first element if value is a list
            title = value[0]
        except TypeError:
            # If value is not subscriptable, use it directly
            title = value
        except IndexError:
            # An empty list carries no title at all
            return None

    # Try splitting the title, catch AttributeError if it's not a string
    try:
        return title.split(":")[0].strip()
    except AttributeError:
        # Return the title unchanged if it's not a string (e.g., int)
        return title
def process_published_date(date_parts):
    """Parse a date provided as a nested list of values into a timestamp.

    Handles dates that are missing, incomplete, or not numeric by returning
    ``None`` instead of raising.

    Parameters
    ----------
    date_parts : list of list of int
        The elements of a date provided as a list from CrossRef, nested as
        ``[[year, month, day]]``. Only the first inner list is read.

    Returns
    -------
    pd.Timestamp or None
        The parsed date, or ``None`` when the parts cannot be turned into a
        valid ``YYYY-MM-DD`` date.
    """
    try:
        date_str = (
            f"{date_parts[0][0]}-{date_parts[0][1]:02d}-{date_parts[0][2]:02d}"
        )
        return pd.to_datetime(date_str, format="%Y-%m-%d")
    # The errors are caught together because they are all handled the same
    # way. Handle them separately if each should produce its own message.
    #   ValueError: non-numeric parts or an impossible calendar date
    #   TypeError:  the value is not subscriptable (e.g., a missing value)
    #   IndexError: fewer than three parts (e.g., only a year) or an empty list
    except (ValueError, TypeError, IndexError) as e:
        print(e)
        return None
# Columns retained from each normalized JSON file.
columns_to_keep = [
    "publisher",
    "DOI",
    "type",
    "author",
    "is-referenced-by-count",
    "title",
    "published.date-parts",
]

# The JSON files are read from a "data" directory next to this script.
current_dir = Path(__file__).parent
data_dir = current_dir / "data"

all_papers_list = []
# glob() yields matches in an arbitrary order; sorting keeps the row order of
# the combined DataFrame the same from run to run.
for json_file in sorted(data_dir.glob("*.json")):
    papers_df = load_clean_json(json_file, columns_to_keep)

    # Derive the package name from the part of the title before the first ':'
    papers_df["package_name"] = papers_df["title"].apply(clean_title)
    papers_df["published_date"] = papers_df["published.date-parts"].apply(
        process_published_date
    )

    all_papers_list.append(papers_df)

# pd.concat raises a ValueError on an empty list; raise the same exception
# type here, but with a message that points at the directory searched.
if not all_papers_list:
    raise ValueError(f"No JSON files found in {data_dir}")

all_papers_df = pd.concat(all_papers_list, axis=0, ignore_index=True)

print("Final shape of combined DataFrame:", all_papers_df.shape)