generated from HugoBlox/theme-research-group
-
-
Notifications
You must be signed in to change notification settings - Fork 29
/
Copy pathindex_linking.py
108 lines (96 loc) · 4.67 KB
/
index_linking.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
import json
import os
import pandas as pd
import argparse
class IndexLinking:
    """Link author names on publication pages to keys of an author index.

    The author index is a JSON object loaded from ``author_index_path``. Its
    keys are the identifiers written into the pages. Its values are assumed
    to be the matching display-name strings.
    NOTE(review): the index schema is not visible here -- confirm that every
    value is a plain string.
    """

    def __init__(self, author_index_path, pub_dir):
        """Store both paths and load the author index from disk.

        Args:
            author_index_path: Path of the JSON author index file.
            pub_dir: Directory containing one sub-directory per publication.

        Raises:
            OSError: If the index file cannot be opened.
            json.JSONDecodeError: If the index file is not valid JSON.
        """
        self.author_index_path = author_index_path
        self.pub_dir = pub_dir
        # JSON is UTF-8 by specification; do not depend on the locale default.
        with open(author_index_path, "r", encoding="utf-8") as f:
            self.author_index = json.load(f)

    def author_linking(self, query_name):
        """Resolve an author string to a key of the author index.

        Matching is attempted in decreasing order of strictness:

        1. ``query_name`` is itself an index key.
        2. ``query_name`` equals an index value exactly.
        3. Every whitespace-separated token of ``query_name`` occurs among
           the tokens of an index value (first such entry in index order).

        Args:
            query_name: Author string as written on a publication page.

        Returns:
            A two-item list ``[status, key]`` where ``status`` is one of
            ``'In Index'``, ``'In Value'``, ``'In Value by Subset'`` or
            ``'NOT FOUND'``. ``key`` is ``''`` when nothing matched.
        """
        if query_name in self.author_index:
            return ['In Index', query_name]

        # Complete the exact-match pass before any fuzzy matching so that a
        # looser subset hit on an earlier entry cannot shadow an exact hit.
        for key, name in self.author_index.items():
            if query_name == name:
                return ['In Value', key]

        query_tokens = set(query_name.split())
        # An empty token set is a subset of everything; never let a blank
        # query match the first author in the index.
        if query_tokens:
            for key, name in self.author_index.items():
                if query_tokens.issubset(name.split()):
                    return ['In Value by Subset', key]

        return ['NOT FOUND', '']

    def _link_page_authors(self, page_content):
        """Rewrite the list items under ``authors:`` to use index keys.

        Only list items inside the ``authors:`` block are touched; identical
        text elsewhere on the page (other lists, the Markdown body) is left
        alone. Each item's original indentation is preserved.

        Args:
            page_content: Full text of a publication page.

        Returns:
            A tuple ``(new_content, author_map)`` where ``author_map`` maps
            each author string found to its resolved key (``''`` if none).
        """
        lines = page_content.split('\n')
        author_map = {}
        in_authors = False
        for i, line in enumerate(lines):
            if line.startswith('authors:'):
                in_authors = True
                continue
            if not in_authors:
                continue
            # The closing front-matter delimiter starts with '-' but is not
            # a list item; it ends the authors block.
            if line.rstrip() == '---':
                in_authors = False
                continue
            stripped = line.lstrip()
            if stripped.startswith('-'):
                author_name = stripped[1:].strip()
                if not author_name:
                    continue
                author_key = self.author_linking(author_name)[1]
                author_map[author_name] = author_key
                if author_key and author_key != author_name:
                    indent = line[:len(line) - len(stripped)]
                    lines[i] = f"{indent}- {author_key}"
                    print(f"Replaced {author_name} with {author_key}")
            elif not line.startswith(' '):
                # A non-indented, non-item line begins the next field.
                in_authors = False
        return '\n'.join(lines), author_map

    def loop_pub_dir(self):
        """Link the authors of every ``<pub_dir>/<pub_id>/index.md`` page.

        Entries of ``pub_dir`` that are not directories, and directories
        without an ``index.md`` file, are skipped. A page is written back
        only when at least one author line actually changed. Progress is
        printed to standard output.
        """
        for pub_id in os.listdir(self.pub_dir):
            if not os.path.isdir(os.path.join(self.pub_dir, pub_id)):
                continue
            # The actual page is index.md inside the pub_id directory.
            page_path = os.path.join(self.pub_dir, pub_id, "index.md")
            print(page_path)
            if not os.path.isfile(page_path):
                continue
            with open(page_path, "r", encoding="utf-8") as f:
                page_content = f.read()

            # The title is only used for logging, so a page without one
            # must not abort the whole run.
            title = next(
                (
                    line[len('title:'):].strip()
                    for line in page_content.split('\n')
                    if line.startswith('title:')
                ),
                '',
            )
            print('---------------')
            print(pub_id, title)

            new_content, author_lines = self._link_page_authors(page_content)
            print(author_lines)

            # Avoid rewriting (and touching the mtime of) unchanged pages.
            if new_content != page_content:
                with open(page_path, "w", encoding="utf-8") as f:
                    f.write(new_content)
if __name__ == "__main__":
    # Command-line entry point: read the two path options, then rewrite the
    # author entries of every publication page under --pub_dir using the
    # index loaded from --author_index_path.
    cli = argparse.ArgumentParser()
    for flag, fallback in (
        ("--author_index_path", "assets/author_index.json"),
        ("--pub_dir", "content/publication"),
    ):
        cli.add_argument(flag, type=str, default=fallback)
    options = cli.parse_args()
    IndexLinking(options.author_index_path, options.pub_dir).loop_pub_dir()