diff --git a/scraper.py b/scraper.py
index 7256318..bf285b2 100755
--- a/scraper.py
+++ b/scraper.py
@@ -4,12 +4,12 @@
 # pulls company information from site to save time that would be spent manually typing out the info
 # Gavin Inglis
 # January 2019
+# Updated September 2023 BJI
 
 from selenium import webdriver
-from selenium.webdriver.common.keys import Keys
-from selenium.webdriver.support.ui import WebDriverWait
+from selenium.webdriver.chrome.service import Service
+from selenium.webdriver.common.by import By
 
-import zipfile
 import time
 import datetime
 import gspread
@@ -18,19 +18,7 @@
 import re
 import getpass
 
-# Get latest chromedriver zip file for mac, extract into same folder
-try:
-    version = requests.get('https://chromedriver.storage.googleapis.com/LATEST_RELEASE').text
-    url = 'https://chromedriver.storage.googleapis.com/{0}/{1}'.format(version, 'chromedriver_mac64.zip')
-    r = requests.get(url, allow_redirects=True)
-    open('chromedriver.zip', 'wb').write(r.content)
-    with zipfile.ZipFile("chromedriver.zip", "r") as zip_ref:
-        zip_ref.extractall()
-except:
-    pass
-
 '''Globals'''
-
 GOOGLE_URL = 'http://www.google.com/search'
 
 # scope of access for api
@@ -38,34 +26,34 @@
          'https://www.googleapis.com/auth/drive']
 
 # credentials file generated by google developer console when creating sheets api
-credentials = ServiceAccountCredentials.from_json_keyfile_name('PATH TO YOUR CREDENTIALS', scope)
+credentials = ServiceAccountCredentials.from_json_keyfile_name('PATH_TO_YOUR_CREDS', scope)
 gc = gspread.authorize(credentials)
 
 # login url for site
 url = 'https://www.magicformulainvesting.com/Account/LogOn'
 
+# declare driver as chrome headless instance
+service = Service()
 options = webdriver.ChromeOptions()
 options.add_argument('headless')
-# declare driver as chrome headless instance
-driver = webdriver.Chrome(executable_path="./chromedriver", options=options)
+driver = webdriver.Chrome(service=service, options=options)
 
 '''Functions'''
 
 def scrapeSite():
-
-    print("Scraping stock info...") # update for terminal
+    print('Scraping stock info...')
 
     # find all td elements, write needed elements to file
-    trs=driver.find_elements_by_xpath('//table[@class="divheight screeningdata"]/tbody/tr')
+    trs = driver.find_elements(By.XPATH, '//table[@class="divheight screeningdata"]/tbody/tr')
 
     names = []
     tikrs = []
     for tr in trs:
-        td = tr.find_elements_by_xpath(".//td")
+        td = tr.find_elements(By.XPATH, ".//td")
 
-        company_name=td[0].get_attribute("innerHTML")
-        company_tikr=td[1].get_attribute("innerHTML")
+        company_name = td[0].get_attribute("innerHTML")
+        company_tikr = td[1].get_attribute("innerHTML")
 
         names.append(company_name)
         tikrs.append(company_tikr)
 
@@ -73,16 +61,14 @@ def scrapeSite():
     return names, tikrs
 
 def writeSheet(names, tikrs):
-
-    print("Writing to sheet...") # update to terminal
+    print('Writing to sheet...')
 
     # access sheet by url
-    wks = gc.open_by_url("YOUR URL HERE").get_worksheet(1) # worksheet number
-
-    #wks.append_row([' '], table_range='A1') # append a blank line before tickers as requested by OC
-
-    date=datetime.datetime.today().strftime('%Y-%m-%d') # current date
-    wks.append_row([date], table_range='A1') # append the date, starts in first column
+    wks = gc.open_by_url("YOUR URL HERE").get_worksheet(1) # worksheet index 1 (0-based) is the Research sheet
+
+    date = datetime.datetime.today().strftime('%Y-%m-%d') # current date
+    # wks.append_row([date], table_range='A1') # append the date starting in first column
+    wks.append_row([date])
 
     for i in range(len(names)):
         price = '=GOOGLEFINANCE("' + tikrs[i] + '","price")'
@@ -90,14 +76,14 @@ def writeSheet(names, tikrs):
         query = names[i]
         url = getUrl(query)
-
-        wks.append_row([names[i],tikrs[i], price, url], table_range='A1', value_input_option="USER_ENTERED") # start in first column
+
+        wks.append_row([names[i], tikrs[i], price, url], value_input_option="USER_ENTERED")
 
 def getUrl(companyName):
-    url = GOOGLE_URL + '?q=' + companyName
+    url = GOOGLE_URL + '?q=' + companyName
     result = requests.get(url)
 
     # fancy regex courtesy of pbui
-    urls = re.findall('/url\?q=([^&]*)', result.text)
+    urls = re.findall(r'/url\?q=([^&]*)', result.text)
     return urls[0]
 
 '''Main Execution'''
@@ -106,30 +92,31 @@ def getUrl(companyName):
 driver.get(url)
 
 # find the input elements for logging in
-username=driver.find_element_by_name("Email")
-password=driver.find_element_by_name("Password")
+username = driver.find_element(By.NAME, "Email")
+password = driver.find_element(By.NAME, "Password")
 
 # enter email and password. uses getpass to hide password (i.e. not using plaintext)
-your_email=raw_input("Please enter your email for magicformulainvesting.com: ")
-your_password=getpass.getpass("Please enter your password for magicformulainvesting.com: ")
+# Replaced raw_input() with input() for Python 3
+your_email = input("Please enter your email for magicformulainvesting.com: ")
+
+# Have to run scraper.py from the terminal so getpass will work :)
+your_password = getpass.getpass("Please enter your password for magicformulainvesting.com: ")
+
+# selenium types the credentials into the login form
 username.send_keys(your_email)
 password.send_keys(your_password)
 
-# enter email and password (for hard coding only)
-# username.send_keys("EMAIL")
-# password.send_keys("PASSWORD")
-
 # click login button
-button=driver.find_element_by_name("login")
+button = driver.find_element(By.NAME, "login")
 button.click()
 
-time.sleep(1) # seconds
+time.sleep(1) # seconds
 
-# use xpathing to find the radio button element for 50 stocks and click it
-radio = driver.find_element_by_xpath('//input[@value="false" and contains(@name,"Select30")]')
+# use xpath to find the radio button element for 50 stocks and click it
+radio = driver.find_element(By.XPATH, '//*[@id="Select30" and @value="false"]')
 radio.click()
 
-button2=driver.find_element_by_name("stocks")
+button2 = driver.find_element(By.NAME, "stocks")
 button2.click()
 
 time.sleep(.5)