上传代码

cy576013581 · Dec 21, 2017 · 9dd8b3a · 9dd8b3a
1 parent 5268faa
commit 9dd8b3a
Show file tree

Hide file tree

Showing 6 changed files with 2,637 additions and 0 deletions.
diff --git a/html_downloader.py b/html_downloader.py
@@ -0,0 +1,27 @@
+import urllib.request as urllib
+
+
+class HtmlDownloader(object):
+    """Fetch the raw bytes of a web page over HTTP."""
+
+    def __init__(self):
+        pass
+
+    def download(self, url, timeout=10):
+        """Return the body of *url* as bytes, or ``None``.
+
+        ``None`` is returned when *url* is ``None`` or when the response
+        status is anything other than 200.  Errors raised by ``urlopen``
+        (for example ``URLError`` or a timeout) propagate to the caller.
+
+        *timeout* is the socket timeout in seconds; without one a stalled
+        server would block the caller indefinitely.
+        """
+        if url is None:
+            return None
+        # The context manager closes the connection on every path,
+        # including the early return for a non-200 status.
+        with urllib.urlopen(url, timeout=timeout) as response:
+            if response.getcode() != 200:
+                return None
+            return response.read()
diff --git a/html_outputer.py b/html_outputer.py
@@ -0,0 +1,40 @@
+# coding:utf-8
+import html
+
+
+class HtmlOutputer(object):
+    """Collect page records and write them out as a single HTML table."""
+
+    def __init__(self):
+        # Records in the order they were collected; each is a mapping with
+        # 'url', 'title' and 'summary' keys.
+        self.datas = []
+
+    def collect_data(self, data):
+        """Store *data* for later output; ``None`` is silently ignored."""
+        if data is None:
+            return None
+        self.datas.append(data)
+
+    def output_html(self, path='output.html'):
+        """Write every collected record as one table row to *path*.
+
+        The file is written as UTF-8 and declares that charset so browsers
+        decode non-ASCII text correctly.  Cell values come from scraped
+        pages, so they are HTML-escaped before being written.
+        """
+        # ``with`` guarantees the file is flushed and closed even if a
+        # record is missing one of the expected keys.
+        with open(path, 'w', encoding='utf-8') as fout:
+            fout.write("<html>")
+            fout.write('<head><meta charset="utf-8"></head>')
+            fout.write("<body>")
+            fout.write("<table>")
+            for data in self.datas:
+                fout.write("<tr>")
+                for key in ('url', 'title', 'summary'):
+                    fout.write("<td>%s</td>" % html.escape(str(data[key])))
+                fout.write("</tr>")
+            fout.write("</table>")
+            fout.write("</body>")
+            fout.write("</html>")
diff --git a/html_parser.py b/html_parser.py
@@ -0,0 +1,52 @@
+# encoding=utf-8
+from bs4 import BeautifulSoup
+import re
+import urllib.parse as urlparse
+
+
+class HtmlParser(object):
+    """Extract follow-up links and title/summary data from a fetched page."""
+
+    # Matches hrefs that contain an "/item/<word char>" path segment.
+    _ITEM_HREF = re.compile(r"/item/\w")
+
+    def parser(self, page_url, html_cont):
+        """Parse *html_cont* and return ``(new_urls, new_data)``.
+
+        Returns ``None`` when either argument is ``None``.  *new_urls* is a
+        set of absolute URLs and *new_data* a dict with 'url', 'title' and
+        'summary' keys.
+        """
+        if page_url is None or html_cont is None:
+            return None
+        soup = BeautifulSoup(html_cont, 'html.parser', from_encoding='utf-8')
+        new_urls = self._get_new_urls(page_url, soup)
+        new_data = self._get_new_data(page_url, soup)
+        return new_urls, new_data
+
+    def _get_new_urls(self, page_url, soup):
+        """Return absolute URLs for every item link found in *soup*."""
+        return {
+            urlparse.urljoin(page_url, link['href'])
+            for link in soup.find_all('a', href=self._ITEM_HREF)
+        }
+
+    def _get_new_data(self, page_url, soup):
+        """Return the page's url, title and summary text as a dict.
+
+        A page that lacks the title or summary element yields an empty
+        string for that field instead of raising ``AttributeError``.
+        """
+        res_data = {'url': page_url, 'title': '', 'summary': ''}
+
+        # ``find`` returns None when nothing matches, so each lookup is
+        # checked before going one level deeper.
+        title_wrap = soup.find('dd', class_="lemmaWgt-lemmaTitle-title")
+        title_node = title_wrap.find("h1") if title_wrap is not None else None
+        if title_node is not None:
+            res_data['title'] = title_node.get_text()
+
+        summary_node = soup.find('div', class_="lemma-summary")
+        if summary_node is not None:
+            res_data['summary'] = summary_node.get_text()
+        return res_data