Skip to content
Open
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
79 changes: 34 additions & 45 deletions scraper.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,12 @@
# pulls company information from site to save time that would be spent manually typing out the info
# Gavin Inglis
# January 2019
# Updated September 2023 BJI

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By

import zipfile
import time
import datetime
import gspread
Expand All @@ -18,51 +18,39 @@
import re
import getpass

# Best-effort: fetch the latest chromedriver zip for mac and extract it into
# the current working directory. A failure here is reported but does not stop
# the script.
# NOTE(review): only needed while a driver is started from ./chromedriver;
# verify whether this download can be dropped.
try:
    # strip() guards against a trailing newline in the version response,
    # which would otherwise end up inside the download URL
    version = requests.get('https://chromedriver.storage.googleapis.com/LATEST_RELEASE').text.strip()
    url = 'https://chromedriver.storage.googleapis.com/{0}/{1}'.format(version, 'chromedriver_mac64.zip')
    r = requests.get(url, allow_redirects=True)
    # do not write an HTTP error page to disk as if it were the zip
    r.raise_for_status()
    with open('chromedriver.zip', 'wb') as zip_file:
        zip_file.write(r.content)
    with zipfile.ZipFile("chromedriver.zip", "r") as zip_ref:
        zip_ref.extractall()
except (requests.RequestException, zipfile.BadZipFile, OSError) as err:
    # still best-effort, but say why instead of failing silently
    print('Could not download chromedriver: {0}'.format(err))

'''Globals'''

# base url used by getUrl() to look up a company's website
GOOGLE_URL = 'http://www.google.com/search'

# scope of access for api
scope = ['https://spreadsheets.google.com/feeds',
         'https://www.googleapis.com/auth/drive']

# credentials file generated by google developer console when creating sheets api
# (loaded once; replace the placeholder with the path to your json key file)
credentials = ServiceAccountCredentials.from_json_keyfile_name('PATH_TO_YOUR_CREDS', scope)
gc = gspread.authorize(credentials)

# login url for site
url = 'https://www.magicformulainvesting.com/Account/LogOn'

# declare driver as chrome headless instance
# A single Chrome instance is started through a Service object; the
# executable_path keyword is not used because newer Selenium releases
# no longer accept it.
service = Service()
options = webdriver.ChromeOptions()
options.add_argument('headless')
driver = webdriver.Chrome(service=service, options=options)

'''Functions'''
def scrapeSite():

print("Scraping stock info...") # update for terminal
print('Scraping stock info...')

# find all td elements, write needed elements to file
trs=driver.find_elements_by_xpath('//table[@class="divheight screeningdata"]/tbody/tr')
trs=driver.find_elements(By.XPATH,'//table[@class="divheight screeningdata"]/tbody/tr')

names = []
tikrs = []

for tr in trs:
td = tr.find_elements_by_xpath(".//td")
td = tr.find_elements(By.XPATH,".//td")

company_name=td[0].get_attribute("innerHTML")
company_tikr=td[1].get_attribute("innerHTML")
Expand All @@ -73,16 +61,15 @@ def scrapeSite():
return names, tikrs

def writeSheet(names, tikrs):

print("Writing to sheet...") # update to terminal
print('Writing to sheet...')

# access sheet by url
wks = gc.open_by_url("YOUR URL HERE").get_worksheet(1) # worksheet number

#wks.append_row([' '], table_range='A1') # append a blank line before tickers as requested by OC
date=datetime.datetime.today().strftime('%Y-%m-%d') # current date
wks.append_row([date], table_range='A1') # append the date, starts in first column
wks = gc.open_by_url("YOUR URL HERE"
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

comma?

"/edit?usp=sharing").get_worksheet(1) # worksheet num 1 is Research

date=datetime.datetime.today().strftime('%Y-%m-%d') # current date
# wks.append_row([date], table_range='A1') # append the date starting in first column
wks.append_row([date])

for i in range(len(names)):
price = '=GOOGLEFINANCE("' + tikrs[i] + '","price")'
Expand All @@ -91,13 +78,14 @@ def writeSheet(names, tikrs):

url = getUrl(query)

wks.append_row([names[i],tikrs[i], price, url], table_range='A1', value_input_option="USER_ENTERED") # start in first column
# wks.append_row([names[i],tikrs[i], price, url], table_range='A1', value_input_option="USER_ENTERED")
wks.append_row([names[i],tikrs[i], price, url], value_input_option="USER_ENTERED")

def getUrl(companyName):
    """Return the first result link of a Google search for ``companyName``.

    The search page is fetched from GOOGLE_URL with ``companyName`` as the
    ``q`` parameter, and the text following the first ``/url?q=`` (up to the
    next ``&``) is returned.

    Returns an empty string when the page contains no such link, so one
    failed lookup does not abort the caller.
    """
    # let requests build the query string so spaces, '&', etc. are escaped
    result = requests.get(GOOGLE_URL, params={'q': companyName})
    # fancy regex courtesy of pbui (raw string: '\?' is not a valid str escape)
    urls = re.findall(r'/url\?q=([^&]*)', result.text)
    return urls[0] if urls else ''

'''Main Execution'''
Expand All @@ -106,30 +94,31 @@ def getUrl(companyName):
driver.get(url)

# find the input elements for logging in
username=driver.find_element_by_name("Email")
password=driver.find_element_by_name("Password")
username=driver.find_element(By.NAME,"Email")
password=driver.find_element(By.NAME,"Password")

# enter email and password. uses getpass to hide password (i.e. not using plaintext)
your_email=raw_input("Please enter your email for magicformulainvesting.com: ")
# Use input() instead of raw_input() for Python 3
your_email= input("Please enter your email for magicformulainvesting.com: ")
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

spacing :)


# Have to run scraper.py from terminal so getpass will work :)
your_password=getpass.getpass("Please enter your password for magicformulainvesting.com: ")

# selenium sends info to mfi.com
username.send_keys(your_email)
password.send_keys(your_password)

# enter email and password (for hard coding only)
# username.send_keys("EMAIL")
# password.send_keys("PASSWORD")

# click login button
button=driver.find_element_by_name("login")
button=driver.find_element(By.NAME,"login")
button.click()

time.sleep(1) # seconds
time.sleep(1) # seconds

# use xpathing to find the radio button element for 50 stocks and click it
radio = driver.find_element_by_xpath('//input[@value="false" and contains(@name,"Select30")]')
# use xpath to find the radio button element for 50 stocks and click it
radio = driver.find_element(By.XPATH,'//*[@id="Select30" and @value="false"]')
radio.click()

button2=driver.find_element_by_name("stocks")
button2=driver.find_element(By.NAME,"stocks")
button2.click()

time.sleep(.5)
Expand Down