Skip to content

Commit

Permalink
updated sentence annotation file with final version
Browse files Browse the repository at this point in the history
  • Loading branch information
gicraveiro committed Feb 7, 2022
1 parent 9998271 commit bfa0e13
Showing 1 changed file with 14 additions and 21 deletions.
35 changes: 14 additions & 21 deletions sentence_annotation.py
Original file line number Diff line number Diff line change
@@ -1,60 +1,53 @@
import spacy
import os
import pdfx
import re
from googleapiclient.discovery import build
from google.oauth2 import service_account

# process_document: read one PDF, split its text into sentences with spaCy and
# write those sentences into column A of a Google Sheets tab.
#
# Parameters, as they are used in the body below:
#   title                 - PDF file name without the ".pdf" extension; the last
#                           update call also uses it as the tab name
#   source_path           - folder below "data/" that holds the PDF
#   sheet                 - object whose .values().update(...).execute() does the write
#   SAMPLE_SPREADSHEET_ID - ID of the spreadsheet that receives the sentences
#
# NOTE(review): this listing looks like a diff view that lost its +/- markers, so
# lines from before and after the commit appear interleaved (two signatures, two
# PDFx calls, two update calls). Confirm against the repository which lines are live.
#
# NOTE(review): the first signature carries an extra `source` parameter that the
# body never reads; presumably it is the pre-commit signature.
def process_document(title, source_path,source, sheet, SAMPLE_SPREADSHEET_ID):
def process_document(title, source_path, sheet, SAMPLE_SPREADSHEET_ID):

# READING AND MANIPULATING INPUT FILE
# The PDF location is assembled as data/<source_path>/<title>.pdf.
path = 'data/'+source_path+'/'+title+'.pdf'
input_file = pdfx.PDFx(path) # TO DO: OPTIMIZE PATH, GET IT STRAIGHT FROM PARAMETER INSTEAD OF CALCULATING IT AGAIN
input_file = pdfx.PDFx(path)
# From here on `input_file` holds the extracted text, not the PDFx object.
input_file = input_file.get_text()

# `nlp` is not a parameter: it is the module-level spaCy pipeline created further
# down in this script, so it must exist before this function is called.
doc = nlp(input_file)

# One entry per sentence; each entry is itself a single-element list.
values = []

# Separating corpus in sentences
for span in doc.sents:
sentence = []
#sent = re.sub("\n", " ", str(span)) # to get DATAPOLICY3 format comment this line, and add str casting to append
#span = re.sub("\n\n", " ", str(span))
# Newlines inside a sentence are kept as-is because both substitutions above are disabled.
sentence.append(str(span))
values.append(sentence)

# Formatting sentences to fill the spreadsheet
# NOTE(review): per the Sheets API docs, 'USER_ENTERED' parses values as if typed
# into the UI, so a sentence starting with "=" would presumably be treated as a
# formula - confirm whether 'RAW' is the safer choice.
value_input_option = 'USER_ENTERED'
sentences = {
'values': values
}

# This call targets the fixed tab name 'Test'; the call after it targets the tab named after `title`.
# NOTE(review): presumably only one of the two update calls is live - see the diff note at the top.
sheet.values().update(spreadsheetId=SAMPLE_SPREADSHEET_ID, range='Test'+'!A2:A2000',valueInputOption=value_input_option, body=sentences).execute()#title
# Write sentences in the annotation spreadsheet
# NOTE(review): the range A2:A2000 has room for 1999 rows; confirm what happens
# for documents that produce more sentences than that.
sheet.values().update(spreadsheetId=SAMPLE_SPREADSHEET_ID, range=title+'!A2:A2000',valueInputOption=value_input_option, body=sentences).execute()

# Setting up access to annotation file in google spreadsheet
# Everything below runs at import time: it builds the Sheets client, loads the
# spaCy pipeline and then processes the documents found in the configured folder.
#
# NOTE(review): this listing looks like a diff view that lost its +/- markers, so
# some statements below appear twice (spacy.load, path, the driver call/loop).
# Confirm against the repository which lines are live.

# If modifying these scopes, delete the file token.json.
# NOTE(review): no token.json is read anywhere in this listing; the credentials
# come from the service-account key file named below.
SCOPES = ['https://www.googleapis.com/auth/spreadsheets']
SERVICE_ACCOUNT_FILE = 'google_key.json'

# Credentials are built from the key file, restricted to SCOPES.
creds = service_account.Credentials.from_service_account_file(
SERVICE_ACCOUNT_FILE, scopes=SCOPES)

# The ID and range of a sample spreadsheet.
#SAMPLE_SPREADSHEET_ID = '12mT4Fl9t3UVW8Jx8NjA8SJPVWDNH0lnkUgb6cM3ZiyQ'
SAMPLE_SPREADSHEET_ID = '1trg0bot87WtOALsxiiEIVYX6VW6mIBr90GrsY-t2jRw'
# Sheets v4 client; `sheet` is the spreadsheets resource handed to process_document.
service = build('sheets', 'v4', credentials=creds)
sheet = service.spreadsheets()

# `nlp` is the global pipeline that process_document reads.
nlp = spacy.load('en_core_web_lg') # TRY REPRODUCING WITH SM
# Prints the pipeline component names before and after enabling the parser.
print ("Pipeline:", nlp. pipe_names)
nlp.enable_pipe("parser")
print ("Pipeline:", nlp. pipe_names)
# Rebinds `nlp` to the small model; any call made after this line uses it instead of the large one.
nlp = spacy.load('en_core_web_sm') # Obs: splitting for annotation was executed with sm but ideally should have been with large model

# Folder below "data/" that contains the PDFs to process.
path='Privacy/Facebook/TargetCompanySourced' # TO ADD DIFFERENT DOCUMENTS, REMOVE PART AFTER LAST /
source='TargetCompanySourced'
path='Privacy/Facebook/TargetCompanySourced' # TO ADD DIFFERENT DOCUMENTS, UPDATE PATH

# Single-document call passing five positional arguments, including `source`.
# NOTE(review): this only matches the five-parameter signature of process_document; confirm it is still live.
process_document('DataPolicy', path, source, sheet, SAMPLE_SPREADSHEET_ID)

#for filename in os.listdir('data/'+path):
# print(filename)
# file_name, file_extension = os.path.splitext(filename)
# process_document(file_name, path, source, sheet, SAMPLE_SPREADSHEET_ID)
# Loops through all the documents in the specified folder
# Every directory entry is used; the extension is split off and discarded, and
# process_document appends ".pdf" to the remaining name itself.
# NOTE(review): entries are not filtered by extension, so non-PDF files or
# sub-folders in this directory would also be passed on - confirm the folder
# only ever contains PDFs.
for filename in os.listdir('data/'+path):
print(filename)
file_name, file_extension = os.path.splitext(filename)
process_document(file_name, path, sheet, SAMPLE_SPREADSHEET_ID)

0 comments on commit bfa0e13

Please sign in to comment.