This repository has been archived by the owner on Sep 18, 2024. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Send website content as an email (#1)
- The buggy part is: my email content is currently empty!
- Loading branch information
Showing
3 changed files
with
123 additions
and
49 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -4,69 +4,123 @@ | |
Procedure: | ||
Invoke virtual env (Python 3.6) | ||
$ python train_mammo_model.py --dataset_name=mnist --model_name=cnn --optimizer=adam --loss=categorical_crossentropy | ||
$ python scrape_and_email.py | ||
""" | ||
from pdb import set_trace as debug | ||
import os | ||
import datetime | ||
import locale | ||
import urllib2 | ||
from BeautifulSoup import BeautifulSoup | ||
import json | ||
from pathlib import Path | ||
|
||
from urllib.request import urlretrieve | ||
import smtplib | ||
from email.MIMEText import MIMEText | ||
from email.Header import Header | ||
from email.Utils import formatdate | ||
from email.mime.multipart import MIMEMultipart | ||
from email.mime.text import MIMEText | ||
from email.utils import formatdate | ||
|
||
from pydsutils.generic import create_logger | ||
|
||
# Module-wide logger, plus the timestamp captured once when the module loads.
logger = create_logger('scrape_and_email')
now = datetime.datetime.today()

# Email subject line to use for each supported target URL.
email_subject_map = {}
email_subject_map['http://daily.awesomeport.cn'] = dict(
    subject=u'好东西传送门 - daily - ' + now.strftime("%Y-%m-%d %H:%M:%S"),
)
|
||
web_charset = "utf-8" | ||
mail_charset = "ISO-2022-JP" | ||
def get_email_passwd(sender: str, secretfile=None) -> str:
    """Retrieve the pass code stored for a sender's email account.

    The pass codes must be saved as a JSON object: {email_address: passwd}.

    Args:
        sender: Sender email address to look up.
        secretfile: Optional path to the JSON credentials file. When omitted,
            ``cred/email_logins.json`` under the user's home directory is used.

    Returns:
        The pass code stored for ``sender``.

    Raises:
        AssertionError: If ``sender`` has no entry in the credentials file.
        OSError: If the credentials file cannot be opened.
        ValueError: If the credentials file is not valid JSON.
    """
    if secretfile is None:
        # expanduser honours $HOME when it is set and falls back to the
        # platform's notion of the home directory when it is not.
        secretfile = os.path.join(
            os.path.expanduser('~'), 'cred', 'email_logins.json')
    with open(secretfile, encoding='utf-8') as f:
        data = json.load(f)
    if sender not in data:
        # Raised explicitly instead of via `assert` so the check survives
        # `python -O`; exception type and message match the former assert.
        raise AssertionError('{} pass code not found'.format(sender))
    return data[sender]
|
||
targeturl = "http://hogehoge.com/" # Target URL for scraping | ||
targetclass = "h1" # Target element for scraping | ||
|
||
from_address = "[email protected]" # Sender address (Gmail address) | ||
from_password = "gmailpassword" # Sender server password (Gmail password) | ||
to_address = "[email protected]" # Recipient address | ||
def gen_html_mesg(file, encoding='utf-8'):
    """Generate HTML content that can go as an email body.

    Args:
        file: Path of the HTML file to read.
        encoding: Text encoding of ``file``. Defaults to UTF-8 so that the
            result does not depend on the locale of the running process.

    Returns:
        A ``MIMEText`` part with subtype ``html`` holding the file's text.
    """
    content = Path(file).read_text(encoding=encoding)
    return MIMEText(content, 'html')
|
||
statusOK = u"Found / " | ||
statusNG = u"Not Found" | ||
|
||
def scrape_url(url):
    """Download the given URL and wrap its HTML as an email message part.

    Only the document served at ``url`` is downloaded; scripts on the page
    are not executed, so content that a page renders client-side is not
    part of the result.

    Args:
        url: Address of the page to download.

    Returns:
        A ``MIMEText`` part: the downloaded page with subtype ``html`` on
        success, or a plain-text 'Not Found' note when the download fails.
    """
    not_found = 'Not Found'
    logger.info('Start scraping the website content...')
    tmp_html = '/tmp/web_content_to_html.html'
    try:
        urlretrieve(url, tmp_html)
    except Exception as err:
        # Best effort: any download failure becomes a 'Not Found' message,
        # but KeyboardInterrupt/SystemExit are no longer swallowed and the
        # reason for the failure is logged.
        logger.warning('Failed to download %s: %s', url, err)
        return MIMEText(not_found, 'plain')

    message = gen_html_mesg(tmp_html)
    logger.info('Successfully embedded the HTML as an email message...')
    return message
|
||
|
||
def create_email(subject, message, sender, recipients):
    """Assemble the complete multipart email.

    Args:
        subject: Text for the Subject header.
        message: MIME part to attach as the email body.
        sender: Address for the From header.
        recipients: Address(es) for the To header; either a single string
            or a list/tuple of address strings.

    Returns:
        A ``MIMEMultipart`` message with From, To, Subject and Date headers
        set and ``message`` attached.
    """
    if not isinstance(recipients, str):
        # A header value must be a string; several addresses are written
        # as one comma-separated list.
        recipients = ', '.join(recipients)
    msg = MIMEMultipart()  # container that carries the headers and the body
    msg.add_header('From', sender)
    msg.add_header('To', recipients)
    msg.add_header('Subject', subject)
    msg.add_header('Date', formatdate(localtime=True))
    msg.attach(message)
    logger.info('Successfully created the entire email')
    return msg
|
||
def sendmail(subject, text): | ||
msg = create_message(from_address, to_address, subject, text, mail_charset) | ||
s = smtplib.SMTP('smtp.gmail.com', 587) | ||
s.ehlo() | ||
s.starttls() | ||
s.ehlo() | ||
s.login(from_address, from_password) | ||
s.sendmail(from_address, to_address, msg.as_string()) | ||
s.close() | ||
|
||
def send_email(mime_mesg, sender, recipients):
    """Send out an email through the Outlook SMTP server.

    Args:
        mime_mesg: The complete MIME message to send.
        sender: Address used both to log in and as the envelope sender.
        recipients: Envelope recipient(s), passed on to ``SMTP.sendmail``.

    Raises:
        smtplib.SMTPException: If the server rejects the login or the message.
        OSError: If the connection to the mail server fails.
    """
    logger.info('Logging onto mail server...')
    # The context manager sends QUIT and closes the connection even when
    # login or sending raises, so the session is never left open.
    with smtplib.SMTP(host='smtp-mail.outlook.com', port=587) as sess:
        sess.set_debuglevel(False)  # set to True for verbose
        sess.ehlo()
        sess.starttls()
        sess.ehlo()  # say hello again over the now-encrypted channel
        sess.login(sender, get_email_passwd(sender))
        logger.info('Successfully logged on')

        sess.sendmail(sender, recipients, mime_mesg.as_string())
    logger.info('Successfully sent email and closed mail server')
    return
|
||
|
||
def main(target_url: str,
         sender: str = '[email protected]',
         recipients: str = '[email protected]') -> None:
    """Scrape a web page and mail its content.

    Steps: look up the subject for the URL, scrape the page into a message
    body, wrap both into an email, then log onto the mail account and send.

    Args:
        target_url: The website URL to be scraped; must be a key of
            ``email_subject_map``.
        sender: Sender email
        recipients: Recipients' emails
    """
    # Arguments are evaluated left to right, so the subject lookup still
    # happens before the page is scraped.
    email = create_email(
        email_subject_map[target_url]['subject'],
        scrape_url(target_url),
        sender,
        recipients,
    )
    send_email(email, sender, recipients)
|
||
|
||
if __name__ == '__main__':
    import argparse

    # Command-line entry point: the only option is the page to scrape.
    cli = argparse.ArgumentParser()
    cli.add_argument('--target_url', default='http://daily.awesomeport.cn')
    options = vars(cli.parse_args())

    main(**options)
    logger.info("ALL DONE")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,21 @@ | ||
<html lang="zh-CN"> | ||
|
||
<head> | ||
<title>好东西传送门</title> | ||
<meta charset="UTF-8"> | ||
<meta name="description" content="好东西传送门,为您精选专业知识" /> | ||
<meta name="keywords" content="好东西传送门, 知识图谱, 人工智能, 机器学习, 科技资讯, 自然语言处理, NLP, 计算机视觉, CV, 大数据" /> | ||
<meta name="viewport" content="width=device-width, initial-scale=1.0, maximum-scale=1.0, user-scalable=0"> | ||
<link rel="stylesheet" href="/dist/main.dc362cf20e521ae8d352.css"> | ||
</head> | ||
|
||
<body> | ||
<div id="app"></div> | ||
<div id="ana"> | ||
<script src="https://s19.cnzz.com/z_stat.php?id=1269788086&web_id=1269788086" language="JavaScript"></script> | ||
<script src="https://s13.cnzz.com/z_stat.php?id=1271283777&web_id=1271283777" language="JavaScript"></script> | ||
</div> | ||
<script type="text/javascript" src="/dist/vendors.dc362cf20e521ae8d352.js"></script> | ||
<script type="text/javascript" src="/dist/main.dc362cf20e521ae8d352.js"></script> | ||
</body> | ||
</html> |