From bfa0e1398105048e4724b0ce81a6f239a92da85b Mon Sep 17 00:00:00 2001 From: gicraveiro Date: Mon, 7 Feb 2022 16:56:15 +0100 Subject: [PATCH] updated sentence annotation file with final version --- sentence_annotation.py | 35 ++++++++++++++--------------------- 1 file changed, 14 insertions(+), 21 deletions(-) diff --git a/sentence_annotation.py b/sentence_annotation.py index 7766a8e..74732c9 100644 --- a/sentence_annotation.py +++ b/sentence_annotation.py @@ -1,60 +1,53 @@ import spacy import os import pdfx -import re from googleapiclient.discovery import build from google.oauth2 import service_account -def process_document(title, source_path,source, sheet, SAMPLE_SPREADSHEET_ID): +def process_document(title, source_path, sheet, SAMPLE_SPREADSHEET_ID): # READING AND MANIPULATING INPUT FILE path = 'data/'+source_path+'/'+title+'.pdf' - input_file = pdfx.PDFx(path) # TO DO: OPTIMIZE PATH, GET IT STRAIGHT FROM PARAMETER INSTEAD OF CALCULATING IT AGAIN + input_file = pdfx.PDFx(path) input_file = input_file.get_text() doc = nlp(input_file) values = [] + # Separating corpus in sentences for span in doc.sents: sentence = [] - #sent = re.sub("\n", " ", str(span)) # to get DATAPOLICY3 format comment this line, and add str casting to append - #span = re.sub("\n\n", " ", str(span)) sentence.append(str(span)) values.append(sentence) + # Formatting sentences to fill the spreadsheet value_input_option = 'USER_ENTERED' sentences = { 'values': values } - sheet.values().update(spreadsheetId=SAMPLE_SPREADSHEET_ID, range='Test'+'!A2:A2000',valueInputOption=value_input_option, body=sentences).execute()#title + # Write sentences in the annotation spreadsheet + sheet.values().update(spreadsheetId=SAMPLE_SPREADSHEET_ID, range=title+'!A2:A2000',valueInputOption=value_input_option, body=sentences).execute() +# Setting up access to annotation file in google spreadsheet -# If modifying these scopes, delete the file token.json. SCOPES = ['https://www.googleapis.com/auth/spreadsheets'] SERVICE_ACCOUNT_FILE = 'google_key.json' creds = service_account.Credentials.from_service_account_file( SERVICE_ACCOUNT_FILE, scopes=SCOPES) -# The ID and range of a sample spreadsheet. -#SAMPLE_SPREADSHEET_ID = '12mT4Fl9t3UVW8Jx8NjA8SJPVWDNH0lnkUgb6cM3ZiyQ' SAMPLE_SPREADSHEET_ID = '1trg0bot87WtOALsxiiEIVYX6VW6mIBr90GrsY-t2jRw' service = build('sheets', 'v4', credentials=creds) sheet = service.spreadsheets() -nlp = spacy.load('en_core_web_lg') # TRY REPRODUCING WITH SM -print ("Pipeline:", nlp. pipe_names) -nlp.enable_pipe("parser") -print ("Pipeline:", nlp. pipe_names) +nlp = spacy.load('en_core_web_sm') # Obs: splitting for annotation was executed with sm but ideally should have been with large model -path='Privacy/Facebook/TargetCompanySourced' # TO ADD DIFFERENT DOCUMENTS, REMOVE PART AFTER LAST / -source='TargetCompanySourced' +path='Privacy/Facebook/TargetCompanySourced' # TO ADD DIFFERENT DOCUMENTS, UPDATE PATH -process_document('DataPolicy', path, source, sheet, SAMPLE_SPREADSHEET_ID) - -#for filename in os.listdir('data/'+path): -# print(filename) -# file_name, file_extension = os.path.splitext(filename) -# process_document(file_name, path, source, sheet, SAMPLE_SPREADSHEET_ID) \ No newline at end of file +# Loops through all the documents in the specified folder +for filename in os.listdir('data/'+path): + print(filename) + file_name, file_extension = os.path.splitext(filename) + process_document(file_name, path, sheet, SAMPLE_SPREADSHEET_ID) \ No newline at end of file