-
Notifications
You must be signed in to change notification settings - Fork 3
/
iba_scrape.py
executable file
·109 lines (90 loc) · 3.65 KB
/
iba_scrape.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
#! /usr/bin/env python
# scrape the IBA pages for cocktail lists
import sys
import xml.etree.ElementTree as ET
from lxml import html
import requests
from pprint import pprint
from collections import OrderedDict
import json
url = 'http://iba-world.com/new-era-drinks/'
jsonfile = 'IBA_new_era_drinks.json'
url = 'http://iba-world.com/iba-cocktails/'
jsonfile = 'IBA_unforgettables.json'
url = 'http://iba-world.com/contemporary-classics/'
jsonfile = 'IBA_contemporary_classics.json'
jsonfile = 'IBA_.json'
recipes = OrderedDict()
page = requests.get(url)
tree = html.fromstring(page.content)
items = tree.findall(".//div[@class='blog_list_item_lists']")
for item in items:
name = item.find(".//h3").text
name = ' '.join([word.capitalize() for word in name.split()])
body = item.find(".//div[@class='blog_text']")
recipes[name] = {'unit': 'cL'}
print name
children = [c for c in body.iterchildren()]
n = 0
if children[1].tag == 'ul':
n = -1
style = children[n+1].text
if style is None:
try:
style = children[n+1].find('span').text
except:
pass
recipes[name]['style'] = style
recipes[name]['ingredients'] = OrderedDict()
if not children[n+2].tag == 'ul':
print "adapting <p> ingredients:", children[n+2].text
ing_list = ET.tostring(children[n+2]).lstrip('<p>').rstrip('</p>\n').split('<br />\n')
else:
ing_list = [i.text for i in children[n+2].iterchildren()]
for ingredient in ing_list:
if len(ingredient.split()) == 1:
recipes[name]['ingredients'][ingredient.lower()] = ''
continue
unit = ingredient.split()[1].lower()
if unit == 'cl':
recipes[name]['ingredients'][' '.join([w.lower() for w in ingredient.split()[2:]])] = float(ingredient.split()[0])
elif unit == 'bar' or unit == 'to': # bar spoon
recipes[name]['ingredients'][' '.join([w.lower() for w in ingredient.split()[3:]])] = ' '.join(ingredient.split()[:3])
elif unit == 'dashes' or unit == 'drops' or unit == 'with':
recipes[name]['ingredients'][' '.join([w.lower() for w in ingredient.split()[2:]])] = ' '.join(ingredient.split()[:2])
elif unit == 'dash':
recipes[name]['ingredients'][' '.join([w.lower() for w in ingredient.split()[2:]])] = 'dash'
else:
print "using literal: ", ingredient
literal = {'1': 'one', '2': 'two', 'A': 'one'}
try:
recipes[name]['ingredients'][' '.join([w.lower() for w in ingredient.split()[1:]])] = literal[ingredient.split()[0]]
except:
recipes[name]['ingredients'][ingredient.lower()] = ''
# Get full description from the link
ref_url = item.find(".//a[@class='top_hover_image']").attrib.get('href')
detail_page = requests.get(ref_url)
detail_tree = html.fromstring(detail_page.content)
use_next = False
for child in detail_tree.find(".//div[@class='col-sm-9']").iterchildren():
if use_next and child.tag == 'p':
recipes[name]['IBA_description'] = child.text
break
if child.tag =='ul':
use_next = True
with open(jsonfile, 'w') as fp:
json.dump(recipes, fp, indent=4, separators=(',', ': '))
print "Wrote out as {}".format(jsonfile)
sys.exit(0)
raw = sys.argv[1]
with open(raw) as fp:
for line in fp.readlines():
if line.lstrip().startswith(r'<h3>'):
print line.lstrip()
# super hax
if line.startswith(r'<p>'):
print line
if line.startswith(r'<li>'):
print line
if not line.lstrip().startswith('<'):
print line