Skip to content
This repository has been archived by the owner on Sep 18, 2024. It is now read-only.

Commit

Permalink
Send a website content as an email (#1)
Browse files Browse the repository at this point in the history
- Known bug: the email content is currently empty!
  • Loading branch information
xinh3ng authored Jan 24, 2018
1 parent 4ca3ca9 commit 22fb282
Show file tree
Hide file tree
Showing 3 changed files with 123 additions and 49 deletions.
5 changes: 2 additions & 3 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,10 @@ pillow
tqdm
joblib

beautifulsoup4

opencv-python
scikit-learn
tensorflow
tensorboard
keras

pyqt5
spyder
Expand Down
146 changes: 100 additions & 46 deletions scrape_and_email/scrape_and_email.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,69 +4,123 @@
Procedure:
Invoke virtual env (Python 3.6)
$ python train_mammo_model.py --dataset_name=mnist --model_name=cnn --optimizer=adam --loss=categorical_crossentropy
$ python scrape_and_email.py
"""
from pdb import set_trace as debug
import os
import datetime
import locale
import urllib2
from BeautifulSoup import BeautifulSoup
import json
from pathlib import Path

from urllib.request import urlretrieve
import smtplib
from email.MIMEText import MIMEText
from email.Header import Header
from email.Utils import formatdate
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText
from email.utils import formatdate

from pydsutils.generic import create_logger

# Module-level logger shared by every function in this script.
logger = create_logger('scrape_and_email')
# Timestamp taken once, when the module is loaded; the subject lines below
# therefore carry the start-up time, not the time the email is actually sent.
now = datetime.datetime.today()

# Maps a target URL to the settings of the email generated for it.
# Currently the only setting is 'subject' (the subject line).
email_subject_map = {
'http://daily.awesomeport.cn': {
'subject': u'好东西传送门 - daily - ' + now.strftime("%Y-%m-%d %H:%M:%S")
}

}

web_charset = "utf-8"
mail_charset = "ISO-2022-JP"
def get_email_passwd(sender: str) -> str:
    """Retrieve the pass code of the sender's email account.

    The pass codes must be saved in JSON format, ``{email_address: passwd}``,
    in the file ``$HOME/cred/email_logins.json``.

    Args:
        sender: Sender email

    Returns:
        The pass code stored under ``sender``.

    Raises:
        KeyError: If the ``HOME`` environment variable is not set, or if the
            credentials file has no entry for ``sender``.
    """
    secretfile = '{home}/cred/email_logins.json'.format(home=os.environ['HOME'])
    with open(secretfile, encoding='utf-8') as f:
        data = json.load(f)
    # Raise explicitly instead of using ``assert``: assertions are stripped
    # under ``python -O``, which would drop the descriptive message.
    if sender not in data:
        raise KeyError('{} pass code not found'.format(sender))
    return data[sender]

targeturl = "http://hogehoge.com/" # Target URL for scraping
targetclass = "h1" # Target element for scraping

from_address = "[email protected]" # Sender address (Gmail address)
from_password = "gmailpassword" # Sender server password (Gmail password)
to_address = "[email protected]" # Recipient address
def gen_html_mesg(file, encoding='utf-8'):
    """Generate html content that can go as an email body.

    Args:
        file: Path of the HTML file whose content becomes the email body.
        encoding: Text encoding used to decode the file. Passed explicitly so
            the result does not depend on the locale's default encoding,
            which may be unable to decode a UTF-8 page.

    Returns:
        A ``MIMEText`` part of subtype ``html`` holding the file's content.
    """
    content = Path(file).read_text(encoding=encoding)
    message = MIMEText(content, 'html')
    return message

statusOK = u"Found / "
statusNG = u"Not Found"

def scraping(url):
def scrape_url(url):
"""Scrape the give URL"""
status = {
'ok': 'Found',
'ng': 'Not Found'
}
logger.info('Start scraping the website content...')
tmp_html = '/tmp/web_content_to_html.html'
try:
html = urllib2.urlopen(url).read()
soup = BeautifulSoup(html)
target = soup.find(targetclass).renderContents()
if len(target) == 0:
return statusNG
else:
return statusOK + target.decode(web_charset)
urlretrieve(url, tmp_html)
except:
return statusNG

def create_message(from_addr, to_addr, subject, body, encoding):
msg = MIMEText(body, 'plain', encoding)
msg['From'] = from_addr
msg['To'] = to_addr
msg['Subject'] = Header(subject, encoding)
msg["Date"] = formatdate(localtime=True)
return MIMEText(status['ng'], 'plain')

message = gen_html_mesg(tmp_html)
logger.info('Successfully embedded the HTML as an email message...')
return message


def create_email(subject, message, sender, recipients):
    """Assemble the complete email around an already-built body part.

    Args:
        subject: Subject line of the email.
        message: MIME part to attach as the email body.
        sender: Sender email, written to the ``From`` header.
        recipients: Recipient email(s) for the ``To`` header: either a single
            string, or an iterable of address strings, which is joined
            with ``', '``.

    Returns:
        A ``MIMEMultipart`` message with the From, To, Subject and Date
        headers set and ``message`` attached.
    """
    # A header value must be a string; accept a list/tuple of addresses too.
    if not isinstance(recipients, str):
        recipients = ', '.join(recipients)

    msg = MIMEMultipart()  # create a message
    msg.add_header('From', sender)
    msg.add_header('To', recipients)
    msg.add_header('Subject', subject)
    msg.add_header('Date', formatdate(localtime=True))
    msg.attach(message)
    logger.info('Successfully created the entire email')
    return msg

def sendmail(subject, text):
    """Build a message from ``subject`` and ``text`` and send it via Gmail SMTP.

    The sender address, sender password, recipient address and mail charset
    are read from the module-level settings.

    Args:
        subject: Subject line of the email.
        text: Body text of the email.
    """
    msg = create_message(from_address, to_address, subject, text, mail_charset)
    s = smtplib.SMTP('smtp.gmail.com', 587)
    # Close the connection even when the handshake, login or send fails.
    try:
        s.ehlo()
        s.starttls()
        s.ehlo()  # say hello again over the now-encrypted channel
        s.login(from_address, from_password)
        s.sendmail(from_address, to_address, msg.as_string())
    finally:
        s.close()

def send_email(mime_mesg, sender, recipients):
    """Send out an email through ``smtp-mail.outlook.com`` (port 587).

    Args:
        mime_mesg: The complete MIME message; it is serialized with
            ``as_string()`` before sending.
        sender: Sender email; also the login name, whose pass code is looked
            up with ``get_email_passwd``.
        recipients: Recipients' emails, handed to ``SMTP.sendmail``.
    """
    logger.info('Logging onto mail server...')
    # The context manager issues QUIT and closes the connection on exit, even
    # when the login or the send raises, so the session is never leaked.
    with smtplib.SMTP(host='smtp-mail.outlook.com', port=587) as sess:
        sess.set_debuglevel(False)  # set to True for verbose
        sess.ehlo()
        sess.starttls()
        sess.ehlo()  # say hello again over the now-encrypted channel
        sess.login(sender, get_email_passwd(sender))
        logger.info('Successfully logged on')

        sess.sendmail(sender, recipients, mime_mesg.as_string())
    logger.info('Successfully sent email and closed mail server')


def main(target_url: str,
         sender: str = '[email protected]',
         recipients: str = '[email protected]') -> None:
    """Scrape a website and mail its content.

    Steps: look up the subject line for the URL -> scrape the URL into an
    email body -> wrap both into a full email -> log on and send it.

    Args:
        target_url: The website URL to be scraped; must be a key of
            ``email_subject_map``
        sender: Sender email
        recipients: Recipients' emails
    """
    # Arguments are evaluated left to right, so the subject lookup still
    # happens before any scraping is attempted.
    email = create_email(
        email_subject_map[target_url]['subject'],
        scrape_url(target_url),
        sender,
        recipients,
    )
    send_email(email, sender, recipients)


if __name__ == '__main__':
d = datetime.datetime.today()
time = d.strftime("%Y-%m-%d %H:%M:%S")
mailsubject = u"Page Scraping // " + time
mailmessage = scraping(targeturl)
sendmail(mailsubject, mailmessage)
import argparse
parser = argparse.ArgumentParser()
parser.add_argument('--target_url', default='http://daily.awesomeport.cn')

args = parser.parse_args()
main(**vars(args))
logger.info("ALL DONE")
21 changes: 21 additions & 0 deletions scrape_and_email/test.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
<html lang="zh-CN">

<head>
<title>好东西传送门</title>
<meta charset="UTF-8">
<meta name="description" content="好东西传送门,为您精选专业知识" />
<meta name="keywords" content="好东西传送门, 知识图谱, 人工智能, 机器学习, 科技资讯, 自然语言处理, NLP, 计算机视觉, CV, 大数据" />
<meta name="viewport" content="width=device-width, initial-scale=1.0, maximum-scale=1.0, user-scalable=0">
<link rel="stylesheet" href="/dist/main.dc362cf20e521ae8d352.css">
</head>

<body>
<div id="app"></div>
<div id="ana">
<script src="https://s19.cnzz.com/z_stat.php?id=1269788086&web_id=1269788086" language="JavaScript"></script>
<script src="https://s13.cnzz.com/z_stat.php?id=1271283777&web_id=1271283777" language="JavaScript"></script>
</div>
<script type="text/javascript" src="/dist/vendors.dc362cf20e521ae8d352.js"></script>
<script type="text/javascript" src="/dist/main.dc362cf20e521ae8d352.js"></script>
</body>
</html>

0 comments on commit 22fb282

Please sign in to comment.