-
-
Notifications
You must be signed in to change notification settings - Fork 3.6k
/
parse_json.py
130 lines (114 loc) · 4.13 KB
/
parse_json.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
# -*- coding: utf-8 -*-
"""Functions related to converting content into dict/JSON structures."""
from __future__ import absolute_import
import logging
import codecs
import fnmatch
import json
import os
from builtins import next, range # pylint: disable=redefined-builtin
from pyquery import PyQuery
log = logging.getLogger(__name__)
def process_all_json_files(version, build_dir=True):
    """Return a list of pages to index.

    :param version: project version whose built ``.fjson`` files to walk.
    :param build_dir: when True read from the build directory, otherwise
        from the production media path.
    :returns: list of page dicts produced by :func:`process_file`.
    """
    if build_dir:
        full_path = version.project.full_json_path(version.slug)
    else:
        full_path = version.project.get_production_media_path(
            type_='json', version_slug=version.slug, include_file=False)
    # Sphinx-generated index/search pages carry no useful content.
    excluded = ('search.fjson', 'genindex.fjson', 'py-modindex.fjson')
    html_files = []
    for root, _, files in os.walk(full_path):
        for filename in fnmatch.filter(files, '*.fjson'):
            if filename in excluded:
                continue
            html_files.append(os.path.join(root, filename))
    page_list = []
    for filename in html_files:
        # Best-effort: a single broken file must not abort indexing, but
        # a bare ``except`` would also swallow SystemExit/KeyboardInterrupt
        # and hide every failure cause — narrow it and log the traceback.
        try:
            result = process_file(filename)
            if result:
                page_list.append(result)
        except Exception:  # noqa
            log.info('Unable to process file: %s', filename, exc_info=True)
    return page_list
def process_headers(data, filename):
    """Read headers from toc data."""
    if 'toc' not in data:
        return []
    anchors = PyQuery(data['toc'])('a')
    headers = [recurse_while_none(anchor) for anchor in anchors]
    if None in headers:
        log.info('Unable to index file headers for: %s', filename)
    return headers
def generate_sections_from_pyquery(body):
    """Given a pyquery object, generate section dicts for each section."""
    # Capture text inside h1 before the first h2
    h1_section = body('.section > h1')
    if h1_section:
        div = h1_section.parent()
        # Strip the Sphinx permalink marker (pilcrow) from the heading text.
        h1_title = h1_section.text().replace(u'¶', '').strip()
        h1_id = div.attr('id')
        h1_content = ""
        # Walk the siblings that follow the h1, accumulating HTML until the
        # first subsection div is reached (that content belongs to an h2).
        next_p = body('h1').next()
        while next_p:
            if next_p[0].tag == 'div' and 'class' in next_p[0].attrib:
                if 'section' in next_p[0].attrib['class']:
                    break
            h1_content += "\n%s\n" % next_p.html()
            next_p = next_p.next()
        # Only emit an h1 section when it actually had intro content.
        if h1_content:
            yield {
                'id': h1_id,
                'title': h1_title,
                'content': h1_content,
            }
    # Capture text inside h2's
    section_list = body('.section > h2')
    for num in range(len(section_list)):
        # Each h2's parent div is the section container; its id anchors
        # the section and its full HTML becomes the indexed content.
        div = section_list.eq(num).parent()
        header = section_list.eq(num)
        title = header.text().replace(u'¶', '').strip()
        section_id = div.attr('id')
        content = div.html()
        yield {
            'id': section_id,
            'title': title,
            'content': content,
        }
def process_file(filename):
    """Read a file from disk and parse it into a structured dict.

    :param filename: path to a Sphinx ``.fjson`` file.
    :returns: dict with ``headers``, ``content``, ``path``, ``title`` and
        ``sections`` keys, or ``None`` when the file cannot be read or has
        no ``current_page_name``.
    """
    try:
        with codecs.open(filename, encoding='utf-8', mode='r') as f:
            file_contents = f.read()
    except IOError:
        # The traceback is attached via exc_info; no need to bind the
        # exception to an (otherwise unused) local.
        log.info('Unable to index file: %s', filename, exc_info=True)
        return None
    # NOTE: may raise ValueError on malformed JSON; the caller
    # (process_all_json_files) treats any such failure as "skip this file".
    data = json.loads(file_contents)
    sections = []
    title = ''
    body_content = ''
    if 'current_page_name' in data:
        path = data['current_page_name']
    else:
        log.info('Unable to index file due to no name %s', filename)
        return None
    if 'body' in data and data['body']:
        body = PyQuery(data['body'])
        # Strip Sphinx permalink markers (pilcrows) from the plain text.
        body_content = body.text().replace(u'¶', '')
        sections.extend(generate_sections_from_pyquery(body))
    else:
        log.info('Unable to index content for: %s', filename)
    if 'title' in data:
        title = data['title']
        # Sphinx sometimes emits the title as an HTML fragment; extract
        # just its text in that case.
        if title.startswith('<'):
            title = PyQuery(data['title']).text()
    else:
        log.info('Unable to index title for: %s', filename)
    return {'headers': process_headers(data, filename),
            'content': body_content, 'path': path,
            'title': title, 'sections': sections}
def recurse_while_none(element):
    """Return the text of *element*, descending into its first child for
    as long as the current node's text is None."""
    node = element
    while node.text is None:
        node = node.getchildren()[0]
    return node.text