forked from InternLM/Tutorial
-
Notifications
You must be signed in to change notification settings - Fork 0
/
xlsx2jsonl.py
35 lines (26 loc) · 1.22 KB
/
xlsx2jsonl.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
import openpyxl
import json
def process_excel_to_json(input_file, output_file):
    """Convert the "DrugQA" sheet of an Excel workbook into a conversation JSON file.

    Every row from the second one onward becomes one single-turn
    conversation: column A is used as the input, column D as the output, and
    a fixed system prompt is attached to each turn. Rows whose input or
    output cell is empty are skipped.

    NOTE: the result is written as one indented JSON array, not as
    line-delimited JSON, regardless of the extension of ``output_file``.

    Args:
        input_file: Path of the ``.xlsx`` workbook to read.
        output_file: Path of the JSON file to write (overwritten if present).

    Raises:
        KeyError: If the workbook has no sheet named "DrugQA" (raised by
            openpyxl's sheet lookup).
    """
    # The prompt is identical for every row, so define it once outside the loop.
    system_value = "You are a professional, highly experienced doctor professor. You always provide accurate, comprehensive, and detailed answers based on the patients' questions."

    # Load the workbook and select the "DrugQA" sheet.
    wb = openpyxl.load_workbook(input_file)
    sheet = wb["DrugQA"]

    output_data = []
    # min_row=2 skips the first row (presumably a header); max_col=4 limits
    # each yielded tuple to columns A..D.
    for row in sheet.iter_rows(min_row=2, max_col=4, values_only=True):
        question, answer = row[0], row[3]
        # Empty cells come back as None. A turn without a question or an
        # answer would be serialized as null, so drop such rows (this also
        # filters out blank trailing rows reported by the worksheet).
        if question is None or answer is None:
            continue
        conversation = {
            "system": system_value,
            "input": question,
            "output": answer,
        }
        output_data.append({"conversation": [conversation]})

    # ensure_ascii=False writes non-ASCII text as real UTF-8 characters
    # instead of \uXXXX escapes; the file is opened as UTF-8 for that purpose.
    with open(output_file, 'w', encoding='utf-8') as json_file:
        json.dump(output_data, json_file, indent=4, ensure_ascii=False)

    print(f"Conversion complete. Output written to {output_file}")
# Replace 'MedQA2019.xlsx' and 'output.jsonl' with your actual input and output file names.
# The guard keeps the conversion from running as a side effect when this
# module is imported; running the file as a script behaves as before.
if __name__ == "__main__":
    process_excel_to_json('MedQA2019.xlsx', 'output.jsonl')