diff --git a/CHANGELOG.md b/CHANGELOG.md index 2e62ad983..d45d383be 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,19 @@ # Changelog +## v0.5.9 +Released 05 September 2015 + +Highlights: +* Added: Providers Strike, Jackett, custom Torznabs +* Added: Option to stop post-processing if no good match found (#2343) +* Fixed: Blackhole -> Magnet, limit to torcache +* Fixed: Kat 403 flac error +* Fixed: Last.fm errors +* Fixed: Pushover notifications +* Improved: Rutracker logging, switched to requests lib + +The full list of commits can be found [here](https://github.com/rembo10/headphones/compare/v0.5.8...v0.5.9). + ## v0.5.8 Released 13 July 2015 diff --git a/data/interfaces/default/config.html b/data/interfaces/default/config.html index 7478e5ede..5688ce6d3 100644 --- a/data/interfaces/default/config.html +++ b/data/interfaces/default/config.html @@ -533,22 +533,6 @@

Settings

-
-
- -
-
-
- - -
-
- - -
-
-
-
@@ -626,6 +610,87 @@

Settings

+
+
+ +
+
+
+ + +
+
+
+ +
+
+ +
+
+
+
+ + + e.g. http://localhost:9117/torznab/iptorrents +
+
+ + +
+
+ +
+
+ <% + torznab_number = 2 + %> + %for torznab in config['extra_torznabs']: + <% + if torznab[2] == '1' or torznab[2] == 1: + torznab_enabled = "checked" + else: + torznab_enabled = "" + %> +
+
+ + +
+
+ + +
+
+ +
+
+ +
+
+ <% + torznab_number += 1 + %> + %endfor + +
+
+ +
+
+ +
+
+
+ + +
+
+ + +
+
+
+
@@ -683,7 +748,7 @@

Settings

- Target bitrate: + Target bitrate: kbps
@@ -1393,6 +1458,10 @@

Settings

+
+ +
', r.text) + if regex is None: + logger.debug('Error reading token') + return + + self.session.params = {'token': regex.group(1)} + files = {'torrent_file': ("", data)} + + try: + self.session.post(url, params={'action': 'add-file'}, files=files) + except Exception as e: + logger.exception('Error adding file to utorrent %s', e) + diff --git a/headphones/searcher.py b/headphones/searcher.py index 6af321315..7dbcf5ba5 100644 --- a/headphones/searcher.py +++ b/headphones/searcher.py @@ -36,24 +36,20 @@ from headphones.common import USER_AGENT from headphones import logger, db, helpers, classes, sab, nzbget, request -from headphones import utorrent, transmission, notifiers +from headphones import utorrent, transmission, notifiers, rutracker from bencode import bencode, bdecode -import headphones.searcher_rutracker as rutrackersearch - # Magnet to torrent services, for Black hole. Stolen from CouchPotato. TORRENT_TO_MAGNET_SERVICES = [ - 'https://zoink.it/torrent/%s.torrent', - 'http://torrage.com/torrent/%s.torrent', + #'https://zoink.it/torrent/%s.torrent', + #'http://torrage.com/torrent/%s.torrent', 'https://torcache.net/torrent/%s.torrent', ] # Persistent What.cd API object gazelle = None - -# RUtracker search object -rutracker = rutrackersearch.Rutracker() +ruobj = None def fix_url(s, charset="utf-8"): @@ -168,6 +164,8 @@ def get_seed_ratio(provider): seed_ratio = headphones.CONFIG.WAFFLES_RATIO elif provider == 'Mininova': seed_ratio = headphones.CONFIG.MININOVA_RATIO + elif provider == 'Strike': + seed_ratio = headphones.CONFIG.STRIKE_RATIO else: seed_ratio = None @@ -232,7 +230,7 @@ def do_sorted_search(album, new, losslessOnly, choose_specific_download=False): NZB_PROVIDERS = (headphones.CONFIG.HEADPHONES_INDEXER or headphones.CONFIG.NEWZNAB or headphones.CONFIG.NZBSORG or headphones.CONFIG.OMGWTFNZBS) NZB_DOWNLOADERS = (headphones.CONFIG.SAB_HOST or headphones.CONFIG.BLACKHOLE_DIR or headphones.CONFIG.NZBGET_HOST) - TORRENT_PROVIDERS = (headphones.CONFIG.KAT 
or headphones.CONFIG.PIRATEBAY or headphones.CONFIG.OLDPIRATEBAY or headphones.CONFIG.MININOVA or headphones.CONFIG.WAFFLES or headphones.CONFIG.RUTRACKER or headphones.CONFIG.WHATCD) + TORRENT_PROVIDERS = (headphones.CONFIG.TORZNAB or headphones.CONFIG.KAT or headphones.CONFIG.PIRATEBAY or headphones.CONFIG.OLDPIRATEBAY or headphones.CONFIG.MININOVA or headphones.CONFIG.WAFFLES or headphones.CONFIG.RUTRACKER or headphones.CONFIG.WHATCD or headphones.CONFIG.STRIKE) results = [] myDB = db.DBConnection() @@ -793,10 +791,11 @@ def send_to_downloader(data, bestqual, album): # Randomize list of services services = TORRENT_TO_MAGNET_SERVICES[:] random.shuffle(services) + headers = {'User-Agent': USER_AGENT} for service in services: - data = request.request_content(service % torrent_hash) + data = request.request_content(service % torrent_hash, headers=headers) if data and "torcache" in data: if not torrent_to_file(download_path, data): return @@ -818,15 +817,9 @@ def send_to_downloader(data, bestqual, album): "to open or convert magnet links") return else: - if bestqual[3] == "rutracker.org": - download_path, _ = rutracker.get_torrent(bestqual[2], - headphones.CONFIG.TORRENTBLACKHOLE_DIR) - if not download_path: - return - else: - if not torrent_to_file(download_path, data): - return + if not torrent_to_file(download_path, data): + return # Extract folder name from torrent folder_name = read_torrent_name(download_path, bestqual[0]) @@ -836,13 +829,11 @@ def send_to_downloader(data, bestqual, album): elif headphones.CONFIG.TORRENT_DOWNLOADER == 1: logger.info("Sending torrent to Transmission") - # rutracker needs cookies to be set, pass the .torrent file instead of url + # Add torrent if bestqual[3] == 'rutracker.org': - file_or_url, torrentid = rutracker.get_torrent(bestqual[2]) + torrentid = transmission.addTorrent('', data) else: - file_or_url = bestqual[2] - - torrentid = transmission.addTorrent(file_or_url) + torrentid = transmission.addTorrent(bestqual[2]) if not 
torrentid: logger.error("Error sending torrent to Transmission. Are you sure it's running?") @@ -855,13 +846,6 @@ def send_to_downloader(data, bestqual, album): logger.error('Torrent folder name could not be determined') return - # remove temp .torrent file created above - if bestqual[3] == 'rutracker.org': - try: - shutil.rmtree(os.path.split(file_or_url)[0]) - except Exception as e: - logger.exception("Unhandled exception") - # Set Seed Ratio seed_ratio = get_seed_ratio(bestqual[3]) if seed_ratio is not None: @@ -870,29 +854,29 @@ def send_to_downloader(data, bestqual, album): else:# if headphones.CONFIG.TORRENT_DOWNLOADER == 2: logger.info("Sending torrent to uTorrent") - # rutracker needs cookies to be set, pass the .torrent file instead of url + # Add torrent if bestqual[3] == 'rutracker.org': - file_or_url, torrentid = rutracker.get_torrent(bestqual[2]) - folder_name, cacheid = utorrent.dirTorrent(torrentid) - folder_name = os.path.basename(os.path.normpath(folder_name)) - utorrent.labelTorrent(torrentid) + ruobj.utorrent_add_file(data) else: - file_or_url = bestqual[2] - torrentid = calculate_torrent_hash(file_or_url, data) - folder_name = utorrent.addTorrent(file_or_url, torrentid) + utorrent.addTorrent(bestqual[2]) + # Get hash + torrentid = calculate_torrent_hash(bestqual[2], data) + if not torrentid: + logger.error('Torrent id could not be determined') + return + + # Get folder + folder_name = utorrent.getFolder(torrentid) if folder_name: logger.info('Torrent folder name: %s' % folder_name) else: logger.error('Torrent folder name could not be determined') return - # remove temp .torrent file created above - if bestqual[3] == 'rutracker.org': - try: - shutil.rmtree(os.path.split(file_or_url)[0]) - except Exception as e: - logger.exception("Unhandled exception") + # Set Label + if headphones.CONFIG.UTORRENT_LABEL: + utorrent.labelTorrent(torrentid) # Set Seed Ratio seed_ratio = get_seed_ratio(bestqual[3]) @@ -932,7 +916,7 @@ def send_to_downloader(data, 
bestqual, album): if headphones.CONFIG.PUSHBULLET_ENABLED and headphones.CONFIG.PUSHBULLET_ONSNATCH: logger.info(u"Sending PushBullet notification") pushbullet = notifiers.PUSHBULLET() - pushbullet.notify(name + " has been snatched!", "Download started") + pushbullet.notify(name, "Download started") if headphones.CONFIG.TWITTER_ENABLED and headphones.CONFIG.TWITTER_ONSNATCH: logger.info(u"Sending Twitter notification") twitter = notifiers.TwitterNotifier() @@ -1041,12 +1025,7 @@ def verifyresult(title, artistterm, term, lossless): def searchTorrent(album, new=False, losslessOnly=False, albumlength=None, choose_specific_download=False): global gazelle # persistent what.cd api object to reduce number of login attempts - - # rutracker login - if headphones.CONFIG.RUTRACKER and album: - rulogin = rutracker.login(headphones.CONFIG.RUTRACKER_USER, headphones.CONFIG.RUTRACKER_PASSWORD) - if not rulogin: - logger.info(u'Could not login to rutracker, search results will exclude this provider') + global ruobj # and rutracker albumid = album['AlbumID'] reldate = album['ReleaseDate'] @@ -1110,6 +1089,68 @@ def set_proxy(proxy_url): return proxy_url + if headphones.CONFIG.TORZNAB: + provider = "torznab" + torznab_hosts = [] + + if headphones.CONFIG.TORZNAB_HOST and headphones.CONFIG.TORZNAB_ENABLED: + torznab_hosts.append((headphones.CONFIG.TORZNAB_HOST, headphones.CONFIG.TORZNAB_APIKEY, headphones.CONFIG.TORZNAB_ENABLED)) + + for torznab_host in headphones.CONFIG.get_extra_torznabs(): + if torznab_host[2] == '1' or torznab_host[2] == 1: + torznab_hosts.append(torznab_host) + + if headphones.CONFIG.PREFERRED_QUALITY == 3 or losslessOnly: + categories = "3040" + elif headphones.CONFIG.PREFERRED_QUALITY == 1 or allow_lossless: + categories = "3040,3010" + else: + categories = "3010" + + if album['Type'] == 'Other': + categories = "3030" + logger.info("Album type is audiobook/spokenword. 
Using audiobook category") + + for torznab_host in torznab_hosts: + + provider = torznab_host[0] + + # Request results + logger.info('Parsing results from %s using search term: %s' % (torznab_host[0],term)) + + headers = {'User-Agent': USER_AGENT} + params = { + "t": "search", + "apikey": torznab_host[1], + "cat": categories, + "maxage": headphones.CONFIG.USENET_RETENTION, + "q": term + } + + data = request.request_feed( + url=torznab_host[0] + '/api?', + params=params, headers=headers + ) + + # Process feed + if data: + if not len(data.entries): + logger.info(u"No results found from %s for %s", torznab_host[0], term) + else: + for item in data.entries: + try: + url = item.link + title = item.title + size = int(item.links[1]['length']) + if all(word.lower() in title.lower() for word in term.split()): + logger.info('Found %s. Size: %s' % (title, helpers.bytes_to_mb(size))) + resultlist.append((title, size, url, provider, 'torrent', True)) + else: + logger.info('Skipping %s, not all search term words found' % title) + + except Exception as e: + logger.exception("An unknown error occurred trying to parse the feed: %s" % e) + if headphones.CONFIG.KAT: provider = "Kick Ass Torrents" ka_term = term.replace("!", "") @@ -1142,7 +1183,8 @@ def set_proxy(proxy_url): "field": "seeders", "sorder": "desc" } - data = request.request_json(url=providerurl, params=params) + headers = {'User-Agent': USER_AGENT} + data = request.request_json(url=providerurl, params=params, headers=headers) # Process feed if data: @@ -1158,7 +1200,7 @@ def set_proxy(proxy_url): size = int(item['size']) if format == "2": - torrent = request.request_content(url) + torrent = request.request_content(url, headers=headers) if not torrent or (int(torrent.find(".mp3")) > 0 and int(torrent.find(".flac")) < 1): rightformat = False @@ -1239,45 +1281,38 @@ def set_proxy(proxy_url): logger.error(u"An error occurred while trying to parse the response from Waffles.fm: %s", e) # rutracker.org - if 
headphones.CONFIG.RUTRACKER and rulogin: + if headphones.CONFIG.RUTRACKER: provider = "rutracker.org" # Ignore if release date not specified, results too unpredictable if not year and not usersearchterm: - logger.info(u'Release date not specified, ignoring for rutracker.org') + logger.info(u"Release date not specified, ignoring for rutracker.org") else: - if headphones.CONFIG.PREFERRED_QUALITY == 3 or losslessOnly: format = 'lossless' - maxsize = 10000000000 elif headphones.CONFIG.PREFERRED_QUALITY == 1 or allow_lossless: format = 'lossless+mp3' - maxsize = 10000000000 else: format = 'mp3' - maxsize = 300000000 - # build search url based on above - if not usersearchterm: - searchURL = rutracker.searchurl(artistterm, albumterm, year, format) - else: - searchURL = rutracker.searchurl(usersearchterm, ' ', ' ', format) + # Login + if not ruobj or not ruobj.logged_in(): + ruobj = rutracker.Rutracker() + if not ruobj.login(): + ruobj = None - logger.info(u'Parsing results from rutracker.org' % searchURL) + if ruobj and ruobj.logged_in(): - # parse results and get best match - rulist = rutracker.search(searchURL, maxsize, minimumseeders, albumid) + # build search url + if not usersearchterm: + searchURL = ruobj.searchurl(artistterm, albumterm, year, format) + else: + searchURL = ruobj.searchurl(usersearchterm, ' ', ' ', format) - # add best match to overall results list - if rulist: - for ru in rulist: - title = ru[0].decode('utf-8') - size = ru[1] - url = ru[2] - resultlist.append((title, size, url, provider, 'torrent', True)) - logger.info('Found %s. 
Size: %s' % (title, helpers.bytes_to_mb(size))) - else: - logger.info(u"No valid results found from %s" % (provider)) + # parse results + rulist = ruobj.search(searchURL) + if rulist: + resultlist.extend(rulist) if headphones.CONFIG.WHATCD: provider = "What.cd" @@ -1491,6 +1526,57 @@ def set_proxy(proxy_url): except Exception as e: logger.error(u"An unknown error occurred in the Old Pirate Bay parser: %s" % e) + # Strike + if headphones.CONFIG.STRIKE: + provider = "Strike" + s_term = term.replace("!", "") + providerurl = fix_url("https://getstrike.net/api/v2/torrents/search/?phrase=") + + providerurl = providerurl + s_term + "&category=Music" + + if headphones.CONFIG.PREFERRED_QUALITY == 3 or losslessOnly: + format = "2" + providerurl = providerurl + "&subcategory=Lossless" + maxsize = 10000000000 + elif headphones.CONFIG.PREFERRED_QUALITY == 1 or allow_lossless: + format = "10" # MP3 and FLAC + maxsize = 10000000000 + else: + format = "8" # MP3 only + maxsize = 300000000 + + logger.info("Searching %s using term: %s" % (provider, s_term)) + data = request.request_json(url=providerurl) + + if not data or not data.get('torrents'): + logger.info("No results found on %s using search term: %s" % (provider, s_term)) + else: + for item in data['torrents']: + try: + rightformat = True + title = item['torrent_title'] + seeders = item['seeds'] + url = item['magnet_uri'] + size = int(item['size']) + subcategory = item['sub_category'] + + if format == 2: + if subcategory != "Lossless": + rightformat = False + + if rightformat and size < maxsize and minimumseeders < int(seeders): + match = True + logger.info('Found %s. Size: %s' % (title, helpers.bytes_to_mb(size))) + else: + match = False + logger.info( + '%s is larger than the maxsize, the wrong format or has too little seeders for this category, skipping. 
(Size: %i bytes, Seeders: %d, Format: %s)', + title, size, int(seeders), rightformat) + + resultlist.append((title, size, url, provider, 'torrent', match)) + except Exception as e: + logger.exception("Unhandled exception in the Strike parser") + # Mininova if headphones.CONFIG.MININOVA: provider = "Mininova" @@ -1567,12 +1653,14 @@ def preprocess(resultlist): for result in resultlist: if result[4] == 'torrent': + + # rutracker always needs the torrent data + if result[3] == 'rutracker.org': + return ruobj.get_torrent_data(result[2]), result + #Get out of here if we're using Transmission if headphones.CONFIG.TORRENT_DOWNLOADER == 1: ## if not a magnet link still need the .torrent to generate hash... uTorrent support labeling return True, result - # get outta here if rutracker - if result[3] == 'rutracker.org': - return True, result # Get out of here if it's a magnet link if result[2].lower().startswith("magnet:"): return True, result @@ -1581,7 +1669,8 @@ def preprocess(resultlist): headers = {} if result[3] == 'Kick Ass Torrents': - headers['Referer'] = 'http://kat.ph/' + #headers['Referer'] = 'http://kat.ph/' + headers['User-Agent'] = USER_AGENT elif result[3] == 'What.cd': headers['User-Agent'] = 'Headphones' elif result[3] == "The Pirate Bay" or result[3] == "Old Pirate Bay": diff --git a/headphones/searcher_rutracker.py b/headphones/searcher_rutracker.py deleted file mode 100644 index 0817cc501..000000000 --- a/headphones/searcher_rutracker.py +++ /dev/null @@ -1,349 +0,0 @@ -#!/usr/bin/env python -# coding=utf-8 - -# Headphones rutracker.org search -# Functions called from searcher.py - -from bencode import bencode as bencode, bdecode -from urlparse import urlparse -from bs4 import BeautifulSoup -from tempfile import mkdtemp -from hashlib import sha1 - -import headphones -import requests -import cookielib -import urllib2 -import urllib -import re -import os - -from headphones import db, logger - - -class Rutracker(): - - logged_in = False - - # Stores a number 
of login attempts to prevent recursion. - #login_counter = 0 - - def __init__(self): - - self.cookiejar = cookielib.CookieJar() - self.opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(self.cookiejar)) - urllib2.install_opener(self.opener) - - def login(self, login, password): - """Implements tracker login procedure.""" - - self.logged_in = False - - if login is None or password is None: - return False - - #self.login_counter += 1 - - # No recursion wanted. - #if self.login_counter > 1: - # return False - - params = urllib.urlencode({"login_username": login, - "login_password": password, - "login": "Вход"}) - - try: - self.opener.open("http://login.rutracker.org/forum/login.php", params) - except Exception: - pass - - # Check if we're logged in - for cookie in self.cookiejar: - if cookie.name == 'bb_data': - self.logged_in = True - - return self.logged_in - - def searchurl(self, artist, album, year, format): - """ - Return the search url - """ - - # Build search url - searchterm = '' - if artist != 'Various Artists': - searchterm = artist - searchterm = searchterm + ' ' - searchterm = searchterm + album - searchterm = searchterm + ' ' - searchterm = searchterm + year - - providerurl = "http://rutracker.org/forum/tracker.php" - - if format == 'lossless': - format = '+lossless' - elif format == 'lossless+mp3': - format = '+lossless||mp3||aac' - else: - format = '+mp3||aac' - - # sort by size, descending. 
- sort = '&o=7&s=2' - - searchurl = "%s?nm=%s%s%s" % (providerurl, urllib.quote(searchterm), format, sort) - - return searchurl - - def search(self, searchurl, maxsize, minseeders, albumid): - """ - Parse the search results and return valid torrent list - """ - - titles = [] - urls = [] - seeders = [] - sizes = [] - torrentlist = [] - rulist = [] - - try: - - page = self.opener.open(searchurl, timeout=60) - soup = BeautifulSoup(page.read()) - - # Debug - #logger.debug (soup.prettify()) - - # Title - for link in soup.find_all('a', attrs={'class': 'med tLink hl-tags bold'}): - title = link.get_text() - titles.append(title) - - # Download URL - for link in soup.find_all('a', attrs={'class': 'small tr-dl dl-stub'}): - url = link.get('href') - urls.append(url) - - # Seeders - for link in soup.find_all('b', attrs={'class': 'seedmed'}): - seeder = link.get_text() - seeders.append(seeder) - - # Size - for link in soup.find_all('td', attrs={'class': 'row4 small nowrap tor-size'}): - size = link.u.string - sizes.append(size) - - except: - pass - - # Combine lists - torrentlist = zip(titles, urls, seeders, sizes) - - # return if nothing found - if not torrentlist: - return False - - # don't bother checking track counts anymore, let searcher filter instead - # leave code in just in case - check_track_count = False - - if check_track_count: - - # get headphones track count for album, return if not found - myDB = db.DBConnection() - tracks = myDB.select('SELECT * from tracks WHERE AlbumID=?', [albumid]) - hptrackcount = len(tracks) - - if not hptrackcount: - logger.info('headphones track info not found, cannot compare to torrent') - return False - - # Return all valid entries, ignored, required words now checked in searcher.py - - #unwantedlist = ['promo', 'vinyl', '[lp]', 'songbook', 'tvrip', 'hdtv', 'dvd'] - - formatlist = ['ape', 'flac', 'ogg', 'm4a', 'aac', 'mp3', 'wav', 'aif'] - deluxelist = ['deluxe', 'edition', 'japanese', 'exclusive'] - - for torrent in torrentlist: - - 
returntitle = torrent[0].encode('utf-8') - url = torrent[1] - seeders = torrent[2] - size = torrent[3] - - if int(size) <= maxsize and int(seeders) >= minseeders: - - #Torrent topic page - torrent_id = dict([part.split('=') for part in urlparse(url)[4].split('&')])['t'] - topicurl = 'http://rutracker.org/forum/viewtopic.php?t=' + torrent_id - - # add to list - if not check_track_count: - valid = True - else: - - # Check torrent info - self.cookiejar.set_cookie(cookielib.Cookie(version=0, name='bb_dl', value=torrent_id, port=None, port_specified=False, domain='.rutracker.org', domain_specified=False, domain_initial_dot=False, path='/', path_specified=True, secure=False, expires=None, discard=True, comment=None, comment_url=None, rest={'HttpOnly': None}, rfc2109=False)) - - # Debug - #for cookie in self.cookiejar: - # logger.debug ('Cookie: %s' % cookie) - - try: - page = self.opener.open(url) - torrent = page.read() - if torrent: - decoded = bdecode(torrent) - metainfo = decoded['info'] - page.close() - except Exception as e: - logger.error('Error getting torrent: %s' % e) - return False - - # get torrent track count and check for cue - trackcount = 0 - cuecount = 0 - - if 'files' in metainfo: # multi - for pathfile in metainfo['files']: - path = pathfile['path'] - for file in path: - if any(file.lower().endswith('.' 
+ x.lower()) for x in formatlist): - trackcount += 1 - if '.cue' in file: - cuecount += 1 - - title = returntitle.lower() - logger.debug('torrent title: %s' % title) - logger.debug('headphones trackcount: %s' % hptrackcount) - logger.debug('rutracker trackcount: %s' % trackcount) - - # If torrent track count less than headphones track count, and there's a cue, then attempt to get track count from log(s) - # This is for the case where we have a single .flac/.wav which can be split by cue - # Not great, but shouldn't be doing this too often - totallogcount = 0 - if trackcount < hptrackcount and cuecount > 0 and cuecount < hptrackcount: - page = self.opener.open(topicurl, timeout=60) - soup = BeautifulSoup(page.read()) - findtoc = soup.find_all(text='TOC of the extracted CD') - if not findtoc: - findtoc = soup.find_all(text='TOC извлечённого CD') - for toc in findtoc: - logcount = 0 - for toccontent in toc.find_all_next(text=True): - cut_string = toccontent.split('|') - new_string = cut_string[0].lstrip().rstrip() - if new_string == '1' or new_string == '01': - logcount = 1 - elif logcount > 0: - if new_string.isdigit(): - logcount += 1 - else: - break - totallogcount = totallogcount + logcount - - if totallogcount > 0: - trackcount = totallogcount - logger.debug('rutracker logtrackcount: %s' % totallogcount) - - # If torrent track count = hp track count then return torrent, - # if greater, check for deluxe/special/foreign editions - # if less, then allow if it's a single track with a cue - valid = False - - if trackcount == hptrackcount: - valid = True - elif trackcount > hptrackcount: - if any(deluxe in title for deluxe in deluxelist): - valid = True - - # Add to list - if valid: - rulist.append((returntitle, size, topicurl)) - else: - if topicurl: - logger.info(u'Torrent found with %s tracks but the selected headphones release has %s tracks, skipping for rutracker.org' % (topicurl, trackcount, hptrackcount)) - else: - logger.info('%s is larger than the maxsize or 
has too little seeders for this category, skipping. (Size: %i bytes, Seeders: %i)' % (returntitle, int(size), int(seeders))) - - return rulist - - def get_torrent(self, url, savelocation=None): - - torrent_id = dict([part.split('=') for part in urlparse(url)[4].split('&')])['t'] - self.cookiejar.set_cookie(cookielib.Cookie(version=0, name='bb_dl', value=torrent_id, port=None, port_specified=False, domain='.rutracker.org', domain_specified=False, domain_initial_dot=False, path='/', path_specified=True, secure=False, expires=None, discard=True, comment=None, comment_url=None, rest={'HttpOnly': None}, rfc2109=False)) - downloadurl = 'http://dl.rutracker.org/forum/dl.php?t=' + torrent_id - torrent_name = torrent_id + '.torrent' - - try: - prev = os.umask(headphones.UMASK) - page = self.opener.open(downloadurl) - torrent = page.read() - decoded = bdecode(torrent) - metainfo = decoded['info'] - tor_hash = sha1(bencode(metainfo)).hexdigest() - if savelocation: - download_path = os.path.join(savelocation, torrent_name) - else: - tempdir = mkdtemp(suffix='_rutracker_torrents') - download_path = os.path.join(tempdir, torrent_name) - - with open(download_path, 'wb') as f: - f.write(torrent) - os.umask(prev) - - # Add file to utorrent - if headphones.CONFIG.TORRENT_DOWNLOADER == 2: - self.utorrent_add_file(download_path) - - except Exception as e: - logger.error('Error getting torrent: %s', e) - return False - - return download_path, tor_hash - - #TODO get this working in utorrent.py - def utorrent_add_file(self, filename): - - host = headphones.CONFIG.UTORRENT_HOST - if not host.startswith('http'): - host = 'http://' + host - if host.endswith('/'): - host = host[:-1] - if host.endswith('/gui'): - host = host[:-4] - - base_url = host - username = headphones.CONFIG.UTORRENT_USERNAME - password = headphones.CONFIG.UTORRENT_PASSWORD - - session = requests.Session() - url = base_url + '/gui/' - session.auth = (username, password) - - try: - r = session.get(url + 'token.html') - 
except Exception: - logger.exception('Error getting token') - return - - if r.status_code == '401': - logger.debug('Error reaching utorrent') - return - - regex = re.search(r'.+>([^<]+)
', r.text) - if regex is None: - logger.debug('Error reading token') - return - - session.params = {'token': regex.group(1)} - - with open(filename, 'rb') as f: - try: - session.post(url, params={'action': 'add-file'}, - files={'torrent_file': f}) - except Exception: - logger.exception('Error adding file to utorrent') - return diff --git a/headphones/transmission.py b/headphones/transmission.py index ec5a9f601..990360d00 100644 --- a/headphones/transmission.py +++ b/headphones/transmission.py @@ -28,12 +28,15 @@ # Store torrent id so we can check up on it -def addTorrent(link): +def addTorrent(link, data=None): method = 'torrent-add' - if link.endswith('.torrent'): - with open(link, 'rb') as f: - metainfo = str(base64.b64encode(f.read())) + if link.endswith('.torrent') or data: + if data: + metainfo = str(base64.b64encode(data)) + else: + with open(link, 'rb') as f: + metainfo = str(base64.b64encode(f.read())) arguments = {'metainfo': metainfo, 'download-dir': headphones.CONFIG.DOWNLOAD_TORRENT_DIR} else: arguments = {'filename': link, 'download-dir': headphones.CONFIG.DOWNLOAD_TORRENT_DIR} diff --git a/headphones/utorrent.py b/headphones/utorrent.py index 352ac72b4..08d20ce15 100644 --- a/headphones/utorrent.py +++ b/headphones/utorrent.py @@ -220,7 +220,7 @@ def dirTorrent(hash, cacheid=None, return_name=None): cacheid = torrentList['torrentc'] for torrent in torrents: - if torrent[0].upper() == hash: + if torrent[0].upper() == hash.upper(): if not return_name: return torrent[26], cacheid else: @@ -228,8 +228,12 @@ def dirTorrent(hash, cacheid=None, return_name=None): return None, None +def addTorrent(link): + uTorrentClient = utorrentclient() + uTorrentClient.add_url(link) + -def addTorrent(link, hash): +def getFolder(hash): uTorrentClient = utorrentclient() # Get Active Directory from settings @@ -239,8 +243,6 @@ def addTorrent(link, hash): logger.error('Could not get "Put new downloads in:" directory from uTorrent settings, please ensure it is set') return 
None - uTorrentClient.add_url(link) - # Get Torrent Folder Name torrent_folder, cacheid = dirTorrent(hash) @@ -254,10 +256,8 @@ def addTorrent(link, hash): if torrent_folder == active_dir or not torrent_folder: torrent_folder, cacheid = dirTorrent(hash, cacheid, return_name=True) - labelTorrent(hash) return torrent_folder else: - labelTorrent(hash) if headphones.SYS_PLATFORM != "win32": torrent_folder = torrent_folder.replace('\\', '/') return os.path.basename(os.path.normpath(torrent_folder)) diff --git a/headphones/webserve.py b/headphones/webserve.py index b247e0675..121e8667c 100644 --- a/headphones/webserve.py +++ b/headphones/webserve.py @@ -232,11 +232,11 @@ def resumeArtist(self, ArtistID): raise cherrypy.HTTPRedirect("artistPage?ArtistID=%s" % ArtistID) def removeArtist(self, ArtistID): - logger.info(u"Deleting all traces of artist: " + ArtistID) myDB = db.DBConnection() namecheck = myDB.select('SELECT ArtistName from artists where ArtistID=?', [ArtistID]) for name in namecheck: artistname = name['ArtistName'] + logger.info(u"Deleting all traces of artist: " + artistname) myDB.action('DELETE from artists WHERE ArtistID=?', [ArtistID]) from headphones import cache @@ -274,7 +274,7 @@ def scanArtist(self, ArtistID): logger.info(u"Scanning artist: %s", artist_name) full_folder_format = headphones.CONFIG.FOLDER_FORMAT - folder_format = re.findall(r'(.*[Aa]rtist?)\.*', full_folder_format)[0] + folder_format = re.findall(r'(.*?[Aa]rtist?)\.*', full_folder_format)[0] acceptable_formats = ["$artist","$sortartist","$first/$artist","$first/$sortartist"] @@ -802,7 +802,7 @@ def forceSearch(self): @cherrypy.expose def forcePostProcess(self, dir=None, album_dir=None, keep_original_folder=False): from headphones import postprocessor - threading.Thread(target=postprocessor.forcePostProcess, kwargs={'dir': dir, 'album_dir': album_dir, 'keep_original_folder':keep_original_folder}).start() + threading.Thread(target=postprocessor.forcePostProcess, kwargs={'dir': dir, 
'album_dir': album_dir, 'keep_original_folder':keep_original_folder == 'True'}).start() raise cherrypy.HTTPRedirect("home") @cherrypy.expose @@ -1065,6 +1065,11 @@ def config(self): "newznab_apikey": headphones.CONFIG.NEWZNAB_APIKEY, "newznab_enabled": checked(headphones.CONFIG.NEWZNAB_ENABLED), "extra_newznabs": headphones.CONFIG.get_extra_newznabs(), + "use_torznab": checked(headphones.CONFIG.TORZNAB), + "torznab_host": headphones.CONFIG.TORZNAB_HOST, + "torznab_apikey": headphones.CONFIG.TORZNAB_APIKEY, + "torznab_enabled": checked(headphones.CONFIG.TORZNAB_ENABLED), + "extra_torznabs": headphones.CONFIG.get_extra_torznabs(), "use_nzbsorg": checked(headphones.CONFIG.NZBSORG), "nzbsorg_uid": headphones.CONFIG.NZBSORG_UID, "nzbsorg_hash": headphones.CONFIG.NZBSORG_HASH, @@ -1101,6 +1106,8 @@ def config(self): "whatcd_username": headphones.CONFIG.WHATCD_USERNAME, "whatcd_password": headphones.CONFIG.WHATCD_PASSWORD, "whatcd_ratio": headphones.CONFIG.WHATCD_RATIO, + "use_strike": checked(headphones.CONFIG.STRIKE), + "strike_ratio": headphones.CONFIG.STRIKE_RATIO, "pref_qual_0": radio(headphones.CONFIG.PREFERRED_QUALITY, 0), "pref_qual_1": radio(headphones.CONFIG.PREFERRED_QUALITY, 1), "pref_qual_2": radio(headphones.CONFIG.PREFERRED_QUALITY, 2), @@ -1138,6 +1145,7 @@ def config(self): "autowant_upcoming": checked(headphones.CONFIG.AUTOWANT_UPCOMING), "autowant_all": checked(headphones.CONFIG.AUTOWANT_ALL), "autowant_manually_added": checked(headphones.CONFIG.AUTOWANT_MANUALLY_ADDED), + "do_not_process_unmatched": checked(headphones.CONFIG.DO_NOT_PROCESS_UNMATCHED), "keep_torrent_files": checked(headphones.CONFIG.KEEP_TORRENT_FILES), "prefer_torrents_0": radio(headphones.CONFIG.PREFER_TORRENTS, 0), "prefer_torrents_1": radio(headphones.CONFIG.PREFER_TORRENTS, 1), @@ -1278,12 +1286,12 @@ def configUpdate(self, **kwargs): # Handle the variable config options. 
Note - keys with False values aren't getting passed checked_configs = [ - "launch_browser", "enable_https", "api_enabled", "use_blackhole", "headphones_indexer", "use_newznab", "newznab_enabled", + "launch_browser", "enable_https", "api_enabled", "use_blackhole", "headphones_indexer", "use_newznab", "newznab_enabled", "use_torznab", "torznab_enabled", "use_nzbsorg", "use_omgwtfnzbs", "use_kat", "use_piratebay", "use_oldpiratebay", "use_mininova", "use_waffles", "use_rutracker", - "use_whatcd", "preferred_bitrate_allow_lossless", "detect_bitrate", "ignore_clean_releases", "freeze_db", "cue_split", "move_files", - "rename_files", "correct_metadata", "cleanup_files", "keep_nfo", "add_album_art", "embed_album_art", "embed_lyrics", - "replace_existing_folders", "keep_original_folder", "file_underscores", "include_extras", "official_releases_only", - "wait_until_release_date", "autowant_upcoming", "autowant_all", "autowant_manually_added", "keep_torrent_files", "music_encoder", + "use_whatcd", "use_strike", "preferred_bitrate_allow_lossless", "detect_bitrate", "ignore_clean_releases", "freeze_db", "cue_split", "move_files", + "rename_files", "correct_metadata", "cleanup_files", "keep_nfo", "add_album_art", "embed_album_art", "embed_lyrics", + "replace_existing_folders", "keep_original_folder", "file_underscores", "include_extras", "official_releases_only", + "wait_until_release_date", "autowant_upcoming", "autowant_all", "autowant_manually_added", "do_not_process_unmatched", "keep_torrent_files", "music_encoder", "encoderlossless", "encoder_multicore", "delete_lossless_files", "growl_enabled", "growl_onsnatch", "prowl_enabled", "prowl_onsnatch", "xbmc_enabled", "xbmc_update", "xbmc_notify", "lms_enabled", "plex_enabled", "plex_update", "plex_notify", "nma_enabled", "nma_onsnatch", "pushalot_enabled", "pushalot_onsnatch", "synoindex_enabled", "pushover_enabled", @@ -1316,6 +1324,21 @@ def configUpdate(self, **kwargs): del kwargs[key] extra_newznabs.append((newznab_host, 
newznab_api, newznab_enabled)) + extra_torznabs = [] + for kwarg in [x for x in kwargs if x.startswith('torznab_host')]: + torznab_host_key = kwarg + torznab_number = kwarg[12:] + if len(torznab_number): + torznab_api_key = 'torznab_api' + torznab_number + torznab_enabled_key = 'torznab_enabled' + torznab_number + torznab_host = kwargs.get(torznab_host_key, '') + torznab_api = kwargs.get(torznab_api_key, '') + torznab_enabled = int(kwargs.get(torznab_enabled_key, 0)) + for key in [torznab_host_key, torznab_api_key, torznab_enabled_key]: + if key in kwargs: + del kwargs[key] + extra_torznabs.append((torznab_host, torznab_api, torznab_enabled)) + # Convert the extras to list then string. Coming in as 0 or 1 (append new extras to the end) temp_extras_list = [] @@ -1341,11 +1364,18 @@ def configUpdate(self, **kwargs): del kwargs[extra] headphones.CONFIG.EXTRAS = ','.join(str(n) for n in temp_extras_list) + headphones.CONFIG.clear_extra_newznabs() + headphones.CONFIG.clear_extra_torznabs() + headphones.CONFIG.process_kwargs(kwargs) + for extra_newznab in extra_newznabs: headphones.CONFIG.add_extra_newznab(extra_newznab) + for extra_torznab in extra_torznabs: + headphones.CONFIG.add_extra_torznab(extra_torznab) + # Sanity checking if headphones.CONFIG.SEARCH_INTERVAL and headphones.CONFIG.SEARCH_INTERVAL < 360: logger.info("Search interval too low. Resetting to 6 hour minimum") diff --git a/lib/bs4/__init__.py b/lib/bs4/__init__.py index 7ba34269a..d35f765bc 100644 --- a/lib/bs4/__init__.py +++ b/lib/bs4/__init__.py @@ -17,8 +17,8 @@ """ __author__ = "Leonard Richardson (leonardr@segfault.org)" -__version__ = "4.3.2" -__copyright__ = "Copyright (c) 2004-2013 Leonard Richardson" +__version__ = "4.4.0" +__copyright__ = "Copyright (c) 2004-2015 Leonard Richardson" __license__ = "MIT" __all__ = ['BeautifulSoup'] @@ -45,7 +45,7 @@ # The very first thing we do is give a useful error if someone is # running this code under Python 3 without converting it. 
-syntax_error = u'You are trying to run the Python 2 version of Beautiful Soup under Python 3. This will not work. You need to convert the code, either by installing it (`python setup.py install`) or by running 2to3 (`2to3 -w bs4`).' +'You are trying to run the Python 2 version of Beautiful Soup under Python 3. This will not work.'<>'You need to convert the code, either by installing it (`python setup.py install`) or by running 2to3 (`2to3 -w bs4`).' class BeautifulSoup(Tag): """ @@ -77,8 +77,11 @@ class BeautifulSoup(Tag): ASCII_SPACES = '\x20\x0a\x09\x0c\x0d' + NO_PARSER_SPECIFIED_WARNING = "No parser was explicitly specified, so I'm using the best available %(markup_type)s parser for this system (\"%(parser)s\"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.\n\nTo get rid of this warning, change this:\n\n BeautifulSoup([your markup])\n\nto this:\n\n BeautifulSoup([your markup], \"%(parser)s\")\n" + def __init__(self, markup="", features=None, builder=None, - parse_only=None, from_encoding=None, **kwargs): + parse_only=None, from_encoding=None, exclude_encodings=None, + **kwargs): """The Soup object is initialized as the 'root tag', and the provided markup (which can be a string or a file-like object) is fed into the underlying parser.""" @@ -114,9 +117,9 @@ def __init__(self, markup="", features=None, builder=None, del kwargs['isHTML'] warnings.warn( "BS4 does not respect the isHTML argument to the " - "BeautifulSoup constructor. You can pass in features='html' " - "or features='xml' to get a builder capable of handling " - "one or the other.") + "BeautifulSoup constructor. 
Suggest you use " + "features='lxml' for HTML and features='lxml-xml' for " + "XML.") def deprecated_argument(old_name, new_name): if old_name in kwargs: @@ -140,6 +143,7 @@ def deprecated_argument(old_name, new_name): "__init__() got an unexpected keyword argument '%s'" % arg) if builder is None: + original_features = features if isinstance(features, basestring): features = [features] if features is None or len(features) == 0: @@ -151,6 +155,16 @@ def deprecated_argument(old_name, new_name): "requested: %s. Do you need to install a parser library?" % ",".join(features)) builder = builder_class() + if not (original_features == builder.NAME or + original_features in builder.ALTERNATE_NAMES): + if builder.is_xml: + markup_type = "XML" + else: + markup_type = "HTML" + warnings.warn(self.NO_PARSER_SPECIFIED_WARNING % dict( + parser=builder.NAME, + markup_type=markup_type)) + self.builder = builder self.is_xml = builder.is_xml self.builder.soup = self @@ -178,6 +192,8 @@ def deprecated_argument(old_name, new_name): # system. Just let it go. pass if is_file: + if isinstance(markup, unicode): + markup = markup.encode("utf8") warnings.warn( '"%s" looks like a filename, not markup. You should probably open this file and pass the filehandle into Beautiful Soup.' % markup) if markup[:5] == "http:" or markup[:6] == "https:": @@ -185,12 +201,15 @@ def deprecated_argument(old_name, new_name): # Python 3 otherwise. if ((isinstance(markup, bytes) and not b' ' in markup) or (isinstance(markup, unicode) and not u' ' in markup)): + if isinstance(markup, unicode): + markup = markup.encode("utf8") warnings.warn( '"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' 
% markup) for (self.markup, self.original_encoding, self.declared_html_encoding, self.contains_replacement_characters) in ( - self.builder.prepare_markup(markup, from_encoding)): + self.builder.prepare_markup( + markup, from_encoding, exclude_encodings=exclude_encodings)): self.reset() try: self._feed() @@ -203,6 +222,16 @@ def deprecated_argument(old_name, new_name): self.markup = None self.builder.soup = None + def __copy__(self): + return type(self)(self.encode(), builder=self.builder) + + def __getstate__(self): + # Frequently a tree builder can't be pickled. + d = dict(self.__dict__) + if 'builder' in d and not self.builder.picklable: + del d['builder'] + return d + def _feed(self): # Convert the document to Unicode. self.builder.reset() @@ -229,9 +258,7 @@ def new_tag(self, name, namespace=None, nsprefix=None, **attrs): def new_string(self, s, subclass=NavigableString): """Create a new NavigableString associated with this soup.""" - navigable = subclass(s) - navigable.setup() - return navigable + return subclass(s) def insert_before(self, successor): raise NotImplementedError("BeautifulSoup objects don't support insert_before().") @@ -290,14 +317,49 @@ def endData(self, containerClass=NavigableString): def object_was_parsed(self, o, parent=None, most_recent_element=None): """Add an object to the parse tree.""" parent = parent or self.currentTag - most_recent_element = most_recent_element or self._most_recent_element - o.setup(parent, most_recent_element) + previous_element = most_recent_element or self._most_recent_element + + next_element = previous_sibling = next_sibling = None + if isinstance(o, Tag): + next_element = o.next_element + next_sibling = o.next_sibling + previous_sibling = o.previous_sibling + if not previous_element: + previous_element = o.previous_element + + o.setup(parent, previous_element, next_element, previous_sibling, next_sibling) - if most_recent_element is not None: - most_recent_element.next_element = o self._most_recent_element = o 
parent.contents.append(o) + if parent.next_sibling: + # This node is being inserted into an element that has + # already been parsed. Deal with any dangling references. + index = parent.contents.index(o) + if index == 0: + previous_element = parent + previous_sibling = None + else: + previous_element = previous_sibling = parent.contents[index-1] + if index == len(parent.contents)-1: + next_element = parent.next_sibling + next_sibling = None + else: + next_element = next_sibling = parent.contents[index+1] + + o.previous_element = previous_element + if previous_element: + previous_element.next_element = o + o.next_element = next_element + if next_element: + next_element.previous_element = o + o.next_sibling = next_sibling + if next_sibling: + next_sibling.previous_sibling = o + o.previous_sibling = previous_sibling + if previous_sibling: + previous_sibling.next_sibling = o + def _popToTag(self, name, nsprefix=None, inclusivePop=True): """Pops the tag stack up to and including the most recent instance of the given tag. If inclusivePop is false, pops the tag diff --git a/lib/bs4/builder/__init__.py b/lib/bs4/builder/__init__.py index 740f5f29c..f8fce5681 100644 --- a/lib/bs4/builder/__init__.py +++ b/lib/bs4/builder/__init__.py @@ -80,9 +80,12 @@ def lookup(self, *features): class TreeBuilder(object): """Turn a document into a Beautiful Soup object tree.""" + NAME = "[Unknown tree builder]" + ALTERNATE_NAMES = [] features = [] is_xml = False + picklable = False preserve_whitespace_tags = set() empty_element_tags = None # A tag will be considered an empty-element # tag when and only when it has no contents. 
diff --git a/lib/bs4/builder/_html5lib.py b/lib/bs4/builder/_html5lib.py index d46b695bd..ab5793c18 100644 --- a/lib/bs4/builder/_html5lib.py +++ b/lib/bs4/builder/_html5lib.py @@ -2,6 +2,7 @@ 'HTML5TreeBuilder', ] +from pdb import set_trace import warnings from bs4.builder import ( PERMISSIVE, @@ -9,7 +10,10 @@ HTML_5, HTMLTreeBuilder, ) -from bs4.element import NamespacedAttribute +from bs4.element import ( + NamespacedAttribute, + whitespace_re, +) import html5lib from html5lib.constants import namespaces from bs4.element import ( @@ -22,11 +26,20 @@ class HTML5TreeBuilder(HTMLTreeBuilder): """Use html5lib to build a tree.""" - features = ['html5lib', PERMISSIVE, HTML_5, HTML] + NAME = "html5lib" + + features = [NAME, PERMISSIVE, HTML_5, HTML] - def prepare_markup(self, markup, user_specified_encoding): + def prepare_markup(self, markup, user_specified_encoding, + document_declared_encoding=None, exclude_encodings=None): # Store the user-specified encoding for use later on. self.user_specified_encoding = user_specified_encoding + + # document_declared_encoding and exclude_encodings aren't used + # ATM because the html5lib TreeBuilder doesn't use + # UnicodeDammit. + if exclude_encodings: + warnings.warn("You provided a value for exclude_encoding, but the html5lib tree builder doesn't support exclude_encoding.") yield (markup, None, None, False) # These methods are defined by Beautiful Soup. @@ -101,7 +114,13 @@ def __init__(self, element): def __iter__(self): return list(self.attrs.items()).__iter__() def __setitem__(self, name, value): - "set attr", name, value + # If this attribute is a multi-valued attribute for this element, + # turn its value into a list. 
+ list_attr = HTML5TreeBuilder.cdata_list_attributes + if (name in list_attr['*'] + or (self.element.name in list_attr + and name in list_attr[self.element.name])): + value = whitespace_re.split(value) self.element[name] = value def items(self): return list(self.attrs.items()) @@ -161,6 +180,12 @@ def appendChild(self, node): # immediately after the parent, if it has no children.) if self.element.contents: most_recent_element = self.element._last_descendant(False) + elif self.element.next_element is not None: + # Something from further ahead in the parse tree is + # being inserted into this earlier element. This is + # very annoying because it means an expensive search + # for the last element in the tree. + most_recent_element = self.soup._last_descendant() else: most_recent_element = self.element @@ -172,6 +197,7 @@ def getAttributes(self): return AttrList(self.element) def setAttributes(self, attributes): + if attributes is not None and len(attributes) > 0: converted_attributes = [] @@ -218,6 +244,9 @@ def removeChild(self, node): def reparentChildren(self, new_parent): """Move all of this tag's children into another tag.""" + # print "MOVE", self.element.contents + # print "FROM", self.element + # print "TO", new_parent.element element = self.element new_parent_element = new_parent.element # Determine what this tag's next_element will be once all the children @@ -236,17 +265,28 @@ def reparentChildren(self, new_parent): new_parents_last_descendant_next_element = new_parent_element.next_element to_append = element.contents - append_after = new_parent.element.contents + append_after = new_parent_element.contents if len(to_append) > 0: # Set the first child's previous_element and previous_sibling # to elements within the new parent first_child = to_append[0] - first_child.previous_element = new_parents_last_descendant + if new_parents_last_descendant: + first_child.previous_element = new_parents_last_descendant + else: + first_child.previous_element = 
new_parent_element first_child.previous_sibling = new_parents_last_child + if new_parents_last_descendant: + new_parents_last_descendant.next_element = first_child + else: + new_parent_element.next_element = first_child + if new_parents_last_child: + new_parents_last_child.next_sibling = first_child # Fix the last child's next_element and next_sibling last_child = to_append[-1] last_child.next_element = new_parents_last_descendant_next_element + if new_parents_last_descendant_next_element: + new_parents_last_descendant_next_element.previous_element = last_child last_child.next_sibling = None for child in to_append: @@ -257,6 +297,10 @@ def reparentChildren(self, new_parent): element.contents = [] element.next_element = final_next_element + # print "DONE WITH MOVE" + # print "FROM", self.element + # print "TO", new_parent_element + def cloneNode(self): tag = self.soup.new_tag(self.element.name, self.namespace) node = Element(tag, self.soup, self.namespace) @@ -268,7 +312,7 @@ def hasContent(self): return self.element.contents def getNameTuple(self): - if self.namespace is None: + if self.namespace == None: return namespaces["html"], self.name else: return self.namespace, self.name diff --git a/lib/bs4/builder/_htmlparser.py b/lib/bs4/builder/_htmlparser.py index ca8d8b892..0101d647b 100644 --- a/lib/bs4/builder/_htmlparser.py +++ b/lib/bs4/builder/_htmlparser.py @@ -4,10 +4,16 @@ 'HTMLParserTreeBuilder', ] -from HTMLParser import ( - HTMLParser, - HTMLParseError, - ) +from HTMLParser import HTMLParser + +try: + from HTMLParser import HTMLParseError +except ImportError, e: + # HTMLParseError is removed in Python 3.5. Since it can never be + # thrown in 3.5, we can just define our own class as a placeholder. + class HTMLParseError(Exception): + pass + import sys import warnings @@ -19,10 +25,10 @@ # At the end of this file, we monkeypatch HTMLParser so that # strict=True works well on Python 3.2.2. 
major, minor, release = sys.version_info[:3] -CONSTRUCTOR_TAKES_STRICT = ( - major > 3 - or (major == 3 and minor > 2) - or (major == 3 and minor == 2 and release >= 3)) +CONSTRUCTOR_TAKES_STRICT = major == 3 and minor == 2 and release >= 3 +CONSTRUCTOR_STRICT_IS_DEPRECATED = major == 3 and minor == 3 +CONSTRUCTOR_TAKES_CONVERT_CHARREFS = major == 3 and minor >= 4 + from bs4.element import ( CData, @@ -63,7 +69,8 @@ def handle_data(self, data): def handle_charref(self, name): # XXX workaround for a bug in HTMLParser. Remove this once - # it's fixed. + # it's fixed in all supported versions. + # http://bugs.python.org/issue13633 if name.startswith('x'): real_name = int(name.lstrip('x'), 16) elif name.startswith('X'): @@ -113,14 +120,6 @@ def unknown_decl(self, data): def handle_pi(self, data): self.soup.endData() - if data.endswith("?") and data.lower().startswith("xml"): - # "An XHTML processing instruction using the trailing '?' - # will cause the '?' to be included in data." - HTMLParser - # docs. - # - # Strip the question mark so we don't end up with two - # question marks. 
- data = data[:-1] self.soup.handle_data(data) self.soup.endData(ProcessingInstruction) @@ -128,15 +127,19 @@ def handle_pi(self, data): class HTMLParserTreeBuilder(HTMLTreeBuilder): is_xml = False - features = [HTML, STRICT, HTMLPARSER] + picklable = True + NAME = HTMLPARSER + features = [NAME, HTML, STRICT] def __init__(self, *args, **kwargs): - if CONSTRUCTOR_TAKES_STRICT: + if CONSTRUCTOR_TAKES_STRICT and not CONSTRUCTOR_STRICT_IS_DEPRECATED: kwargs['strict'] = False + if CONSTRUCTOR_TAKES_CONVERT_CHARREFS: + kwargs['convert_charrefs'] = False self.parser_args = (args, kwargs) def prepare_markup(self, markup, user_specified_encoding=None, - document_declared_encoding=None): + document_declared_encoding=None, exclude_encodings=None): """ :return: A 4-tuple (markup, original encoding, encoding declared within markup, whether any characters had to be @@ -147,7 +150,8 @@ def prepare_markup(self, markup, user_specified_encoding=None, return try_encodings = [user_specified_encoding, document_declared_encoding] - dammit = UnicodeDammit(markup, try_encodings, is_html=True) + dammit = UnicodeDammit(markup, try_encodings, is_html=True, + exclude_encodings=exclude_encodings) yield (dammit.markup, dammit.original_encoding, dammit.declared_html_encoding, dammit.contains_replacement_characters) diff --git a/lib/bs4/builder/_lxml.py b/lib/bs4/builder/_lxml.py index fa5d49875..9e8f88fb5 100644 --- a/lib/bs4/builder/_lxml.py +++ b/lib/bs4/builder/_lxml.py @@ -7,7 +7,12 @@ from StringIO import StringIO import collections from lxml import etree -from bs4.element import Comment, Doctype, NamespacedAttribute +from bs4.element import ( + Comment, + Doctype, + NamespacedAttribute, + ProcessingInstruction, +) from bs4.builder import ( FAST, HTML, @@ -25,8 +30,11 @@ class LXMLTreeBuilderForXML(TreeBuilder): is_xml = True + NAME = "lxml-xml" + ALTERNATE_NAMES = ["xml"] + # Well, it's permissive by XML parser standards. 
- features = [LXML, XML, FAST, PERMISSIVE] + features = [NAME, LXML, XML, FAST, PERMISSIVE] CHUNK_SIZE = 512 @@ -70,6 +78,7 @@ def _getNsTag(self, tag): return (None, tag) def prepare_markup(self, markup, user_specified_encoding=None, + exclude_encodings=None, document_declared_encoding=None): """ :yield: A series of 4-tuples. @@ -95,7 +104,8 @@ def prepare_markup(self, markup, user_specified_encoding=None, # the document as each one in turn. is_html = not self.is_xml try_encodings = [user_specified_encoding, document_declared_encoding] - detector = EncodingDetector(markup, try_encodings, is_html) + detector = EncodingDetector( + markup, try_encodings, is_html, exclude_encodings) for encoding in detector.encodings: yield (detector.markup, encoding, document_declared_encoding, False) @@ -189,7 +199,9 @@ def end(self, name): self.nsmaps.pop() def pi(self, target, data): - pass + self.soup.endData() + self.soup.handle_data(target + ' ' + data) + self.soup.endData(ProcessingInstruction) def data(self, content): self.soup.handle_data(content) @@ -212,7 +224,10 @@ def test_fragment_to_document(self, fragment): class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML): - features = [LXML, HTML, FAST, PERMISSIVE] + NAME = LXML + ALTERNATE_NAMES = ["lxml-html"] + + features = ALTERNATE_NAMES + [NAME, HTML, FAST, PERMISSIVE] is_xml = False def default_parser(self, encoding): diff --git a/lib/bs4/dammit.py b/lib/bs4/dammit.py index 32e211dc4..317ad6d74 100644 --- a/lib/bs4/dammit.py +++ b/lib/bs4/dammit.py @@ -3,10 +3,11 @@ This library converts a bytestream to Unicode through any means necessary. It is heavily based on code from Mark Pilgrim's Universal -Feed Parser. It works best on XML and XML, but it does not rewrite the +Feed Parser. It works best on XML and HTML, but it does not rewrite the XML or HTML to reflect a new encoding; that's the tree builder's job. 
""" +from pdb import set_trace import codecs from htmlentitydefs import codepoint2name import re @@ -212,8 +213,11 @@ class EncodingDetector: 5. Windows-1252. """ - def __init__(self, markup, override_encodings=None, is_html=False): + def __init__(self, markup, override_encodings=None, is_html=False, + exclude_encodings=None): self.override_encodings = override_encodings or [] + exclude_encodings = exclude_encodings or [] + self.exclude_encodings = set([x.lower() for x in exclude_encodings]) self.chardet_encoding = None self.is_html = is_html self.declared_encoding = None @@ -224,6 +228,8 @@ def __init__(self, markup, override_encodings=None, is_html=False): def _usable(self, encoding, tried): if encoding is not None: encoding = encoding.lower() + if encoding in self.exclude_encodings: + return False if encoding not in tried: tried.add(encoding) return True @@ -266,6 +272,9 @@ def encodings(self): def strip_byte_order_mark(cls, data): """If a byte-order mark is present, strip it and return the encoding it implies.""" encoding = None + if isinstance(data, unicode): + # Unicode data cannot have a byte-order mark. 
+ return data, encoding if (len(data) >= 4) and (data[:2] == b'\xfe\xff') \ and (data[2:4] != '\x00\x00'): encoding = 'utf-16be' @@ -299,14 +308,14 @@ def find_declared_encoding(cls, markup, is_html=False, search_entire_document=Fa else: xml_endpos = 1024 html_endpos = max(2048, int(len(markup) * 0.05)) - + declared_encoding = None declared_encoding_match = xml_encoding_re.search(markup, endpos=xml_endpos) if not declared_encoding_match and is_html: declared_encoding_match = html_meta_re.search(markup, endpos=html_endpos) if declared_encoding_match is not None: declared_encoding = declared_encoding_match.groups()[0].decode( - 'ascii') + 'ascii', 'replace') if declared_encoding: return declared_encoding.lower() return None @@ -331,13 +340,14 @@ class UnicodeDammit: ] def __init__(self, markup, override_encodings=[], - smart_quotes_to=None, is_html=False): + smart_quotes_to=None, is_html=False, exclude_encodings=[]): self.smart_quotes_to = smart_quotes_to self.tried_encodings = [] self.contains_replacement_characters = False self.is_html = is_html - self.detector = EncodingDetector(markup, override_encodings, is_html) + self.detector = EncodingDetector( + markup, override_encodings, is_html, exclude_encodings) # Short-circuit if the data is in Unicode to begin with. 
if isinstance(markup, unicode) or markup == '': diff --git a/lib/bs4/diagnose.py b/lib/bs4/diagnose.py index b7c99b1c5..1b719830b 100644 --- a/lib/bs4/diagnose.py +++ b/lib/bs4/diagnose.py @@ -33,12 +33,21 @@ def diagnose(data): if 'lxml' in basic_parsers: basic_parsers.append(["lxml", "xml"]) - from lxml import etree - print "Found lxml version %s" % ".".join(map(str,etree.LXML_VERSION)) + try: + from lxml import etree + print "Found lxml version %s" % ".".join(map(str,etree.LXML_VERSION)) + except ImportError, e: + print ( + "lxml is not installed or couldn't be imported.") + if 'html5lib' in basic_parsers: - import html5lib - print "Found html5lib version %s" % html5lib.__version__ + try: + import html5lib + print "Found html5lib version %s" % html5lib.__version__ + except ImportError, e: + print ( + "html5lib is not installed or couldn't be imported.") if hasattr(data, 'read'): data = data.read() @@ -135,7 +144,7 @@ def rword(length=5): def rsentence(length=4): "Generate a random sentence-like string." return " ".join(rword(random.randint(4,9)) for i in range(length)) - + def rdoc(num_elements=1000): """Randomly generate an invalid HTML document.""" tag_names = ['p', 'div', 'span', 'i', 'b', 'script', 'table'] @@ -159,7 +168,7 @@ def benchmark_parsers(num_elements=100000): print "Comparative parser benchmark on Beautiful Soup %s" % __version__ data = rdoc(num_elements) print "Generated a large invalid HTML document (%d bytes)." 
% len(data) - + for parser in ["lxml", ["lxml", "html"], "html5lib", "html.parser"]: success = False try: diff --git a/lib/bs4/element.py b/lib/bs4/element.py index da9afdf48..c70ad5a01 100644 --- a/lib/bs4/element.py +++ b/lib/bs4/element.py @@ -1,3 +1,4 @@ +from pdb import set_trace import collections import re import sys @@ -185,24 +186,40 @@ def _formatter_for_name(self, name): return self.HTML_FORMATTERS.get( name, HTMLAwareEntitySubstitution.substitute_xml) - def setup(self, parent=None, previous_element=None): + def setup(self, parent=None, previous_element=None, next_element=None, + previous_sibling=None, next_sibling=None): """Sets up the initial relations between this element and other elements.""" self.parent = parent + self.previous_element = previous_element if previous_element is not None: self.previous_element.next_element = self - self.next_element = None - self.previous_sibling = None - self.next_sibling = None - if self.parent is not None and self.parent.contents: - self.previous_sibling = self.parent.contents[-1] + + self.next_element = next_element + if self.next_element: + self.next_element.previous_element = self + + self.next_sibling = next_sibling + if self.next_sibling: + self.next_sibling.previous_sibling = self + + if (not previous_sibling + and self.parent is not None and self.parent.contents): + previous_sibling = self.parent.contents[-1] + + self.previous_sibling = previous_sibling + if previous_sibling: self.previous_sibling.next_sibling = self nextSibling = _alias("next_sibling") # BS3 previousSibling = _alias("previous_sibling") # BS3 def replace_with(self, replace_with): + if not self.parent: + raise ValueError( + "Cannot replace one element with another when the" + "element to be replaced is not part of a tree.") if replace_with is self: return if replace_with is self.parent: @@ -216,6 +233,10 @@ def replace_with(self, replace_with): def unwrap(self): my_parent = self.parent + if not self.parent: + raise ValueError( + "Cannot 
replace an element with its contents when that" + "element is not part of a tree.") my_index = self.parent.index(self) self.extract() for child in reversed(self.contents[:]): @@ -240,17 +261,20 @@ def extract(self): last_child = self._last_descendant() next_element = last_child.next_element - if self.previous_element is not None: + if (self.previous_element is not None and + self.previous_element != next_element): self.previous_element.next_element = next_element - if next_element is not None: + if next_element is not None and next_element != self.previous_element: next_element.previous_element = self.previous_element self.previous_element = None last_child.next_element = None self.parent = None - if self.previous_sibling is not None: + if (self.previous_sibling is not None + and self.previous_sibling != self.next_sibling): self.previous_sibling.next_sibling = self.next_sibling - if self.next_sibling is not None: + if (self.next_sibling is not None + and self.next_sibling != self.previous_sibling): self.next_sibling.previous_sibling = self.previous_sibling self.previous_sibling = self.next_sibling = None return self @@ -478,6 +502,10 @@ def _find_one(self, method, name, attrs, text, **kwargs): def _find_all(self, name, attrs, text, limit, generator, **kwargs): "Iterates over a generator looking for things that match." + if text is None and 'string' in kwargs: + text = kwargs['string'] + del kwargs['string'] + if isinstance(name, SoupStrainer): strainer = name else: @@ -548,17 +576,17 @@ def parents(self): # Methods for supporting CSS selectors. 
- tag_name_re = re.compile('^[a-z0-9]+$') + tag_name_re = re.compile('^[a-zA-Z0-9][-.a-zA-Z0-9:_]*$') - # /^(\w+)\[(\w+)([=~\|\^\$\*]?)=?"?([^\]"]*)"?\]$/ - # \---/ \---/\-------------/ \-------/ - # | | | | - # | | | The value - # | | ~,|,^,$,* or = - # | Attribute + # /^([a-zA-Z0-9][-.a-zA-Z0-9:_]*)\[(\w+)([=~\|\^\$\*]?)=?"?([^\]"]*)"?\]$/ + # \---------------------------/ \---/\-------------/ \-------/ + # | | | | + # | | | The value + # | | ~,|,^,$,* or = + # | Attribute # Tag attribselect_re = re.compile( - r'^(?P<tag>\w+)?\[(?P<attribute>\w+)(?P<operator>[=~\|\^\$\*]?)' + + r'^(?P<tag>[a-zA-Z0-9][-.a-zA-Z0-9:_]*)?\[(?P<attribute>[\w-]+)(?P<operator>[=~\|\^\$\*]?)' + r'=?"?(?P<value>[^\]"]*)"?\]$' ) @@ -654,11 +682,17 @@ def __new__(cls, value): how to handle non-ASCII characters. """ if isinstance(value, unicode): - return unicode.__new__(cls, value) - return unicode.__new__(cls, value, DEFAULT_OUTPUT_ENCODING) + u = unicode.__new__(cls, value) + else: + u = unicode.__new__(cls, value, DEFAULT_OUTPUT_ENCODING) + u.setup() + return u def __copy__(self): - return self + """A copy of a NavigableString has the same contents and class + as the original, but it is not connected to the parse tree. 
+ """ + return type(self)(self) def __getnewargs__(self): return (unicode(self),) @@ -707,7 +741,7 @@ class CData(PreformattedString): class ProcessingInstruction(PreformattedString): PREFIX = u'<?' - SUFFIX = u'?>' + SUFFIX = u'>' class Comment(PreformattedString): @@ -759,9 +793,12 @@ def __init__(self, parser=None, builder=None, name=None, namespace=None, self.prefix = prefix if attrs is None: attrs = {} - elif attrs and builder.cdata_list_attributes: - attrs = builder._replace_cdata_list_attribute_values( - self.name, attrs) + elif attrs: + if builder is not None and builder.cdata_list_attributes: + attrs = builder._replace_cdata_list_attribute_values( + self.name, attrs) + else: + attrs = dict(attrs) else: attrs = dict(attrs) self.attrs = attrs @@ -778,6 +815,18 @@ def __init__(self, parser=None, builder=None, name=None, namespace=None, parserClass = _alias("parser_class") # BS3 + def __copy__(self): + """A copy of a Tag is a new Tag, unconnected to the parse tree. + Its contents are a copy of the old Tag's contents. + """ + clone = type(self)(None, self.builder, self.name, self.namespace, + self.nsprefix, self.attrs) + for attr in ('can_be_empty_element', 'hidden'): + setattr(clone, attr, getattr(self, attr)) + for child in self.contents: + clone.append(child.__copy__()) + return clone + @property def is_empty_element(self): """Is this tag an empty-element tag? (aka a self-closing tag) @@ -971,15 +1020,25 @@ def __ne__(self, other): as defined in __eq__.""" return not self == other - def __repr__(self, encoding=DEFAULT_OUTPUT_ENCODING): + def __repr__(self, encoding="unicode-escape"): """Renders this tag as a string.""" - return self.encode(encoding) + if PY3K: + # "The return value must be a string object", i.e. Unicode + return self.decode() + else: + # "The return value must be a string object", i.e. a bytestring. + # By convention, the return value of __repr__ should also be + # an ASCII string. 
+ return self.encode(encoding) def __unicode__(self): return self.decode() def __str__(self): - return self.encode() + if PY3K: + return self.decode() + else: + return self.encode() if PY3K: __str__ = __repr__ = __unicode__ @@ -1103,12 +1162,18 @@ def decode_contents(self, indent_level=None, formatter="minimal"): """Renders the contents of this tag as a Unicode string. + :param indent_level: Each line of the rendering will be + indented this many spaces. + :param eventual_encoding: The tag is destined to be encoded into this encoding. This method is _not_ responsible for performing that encoding. This information is passed in so that it can be substituted in if the document contains a tag that mentions the document's encoding. + + :param formatter: The output formatter responsible for converting + entities to Unicode characters. """ # First off, turn a string formatter into a function. This # will stop the lookup from happening over and over again. @@ -1137,7 +1202,17 @@ def decode_contents(self, indent_level=None, def encode_contents( self, indent_level=None, encoding=DEFAULT_OUTPUT_ENCODING, formatter="minimal"): - """Renders the contents of this tag as a bytestring.""" + """Renders the contents of this tag as a bytestring. + + :param indent_level: Each line of the rendering will be + indented this many spaces. + + :param eventual_encoding: The bytestring will be in this encoding. + + :param formatter: The output formatter responsible for converting + entities to Unicode characters. 
+ """ + contents = self.decode_contents(indent_level, encoding, formatter) return contents.encode(encoding) @@ -1201,63 +1276,89 @@ def descendants(self): _selector_combinators = ['>', '+', '~'] _select_debug = False - def select(self, selector, _candidate_generator=None): + def select_one(self, selector): """Perform a CSS selection operation on the current element.""" - tokens = selector.split() + value = self.select(selector, limit=1) + if value: + return value[0] + return None + + def select(self, selector, _candidate_generator=None, limit=None): + """Perform a CSS selection operation on the current element.""" + + # Remove whitespace directly after the grouping operator ',' + # then split into tokens. + tokens = re.sub(',[\s]*',',', selector).split() current_context = [self] if tokens[-1] in self._selector_combinators: raise ValueError( 'Final combinator "%s" is missing an argument.' % tokens[-1]) + if self._select_debug: print 'Running CSS selector "%s"' % selector - for index, token in enumerate(tokens): - if self._select_debug: - print ' Considering token "%s"' % token - recursive_candidate_generator = None - tag_name = None + + for index, token_group in enumerate(tokens): + new_context = [] + new_context_ids = set([]) + + # Grouping selectors, ie: p,a + grouped_tokens = token_group.split(',') + if '' in grouped_tokens: + raise ValueError('Invalid group selection syntax: %s' % token_group) + if tokens[index-1] in self._selector_combinators: # This token was consumed by the previous combinator. Skip it. if self._select_debug: print ' Token was consumed by the previous combinator.' continue - # Each operation corresponds to a checker function, a rule - # for determining whether a candidate matches the - # selector. Candidates are generated by the active - # iterator. 
- checker = None - - m = self.attribselect_re.match(token) - if m is not None: - # Attribute selector - tag_name, attribute, operator, value = m.groups() - checker = self._attribute_checker(operator, attribute, value) - - elif '#' in token: - # ID selector - tag_name, tag_id = token.split('#', 1) - def id_matches(tag): - return tag.get('id', None) == tag_id - checker = id_matches - - elif '.' in token: - # Class selector - tag_name, klass = token.split('.', 1) - classes = set(klass.split('.')) - def classes_match(candidate): - return classes.issubset(candidate.get('class', [])) - checker = classes_match - - elif ':' in token: - # Pseudo-class - tag_name, pseudo = token.split(':', 1) - if tag_name == '': - raise ValueError( - "A pseudo-class must be prefixed with a tag name.") - pseudo_attributes = re.match('([a-zA-Z\d-]+)\(([a-zA-Z\d]+)\)', pseudo) - found = [] - if pseudo_attributes is not None: - pseudo_type, pseudo_value = pseudo_attributes.groups() + + for token in grouped_tokens: + if self._select_debug: + print ' Considering token "%s"' % token + recursive_candidate_generator = None + tag_name = None + + # Each operation corresponds to a checker function, a rule + # for determining whether a candidate matches the + # selector. Candidates are generated by the active + # iterator. + checker = None + + m = self.attribselect_re.match(token) + if m is not None: + # Attribute selector + tag_name, attribute, operator, value = m.groups() + checker = self._attribute_checker(operator, attribute, value) + + elif '#' in token: + # ID selector + tag_name, tag_id = token.split('#', 1) + def id_matches(tag): + return tag.get('id', None) == tag_id + checker = id_matches + + elif '.' 
in token: + # Class selector + tag_name, klass = token.split('.', 1) + classes = set(klass.split('.')) + def classes_match(candidate): + return classes.issubset(candidate.get('class', [])) + checker = classes_match + + elif ':' in token: + # Pseudo-class + tag_name, pseudo = token.split(':', 1) + if tag_name == '': + raise ValueError( + "A pseudo-class must be prefixed with a tag name.") + pseudo_attributes = re.match('([a-zA-Z\d-]+)\(([a-zA-Z\d]+)\)', pseudo) + found = [] + if pseudo_attributes is None: + pseudo_type = pseudo + pseudo_value = None + else: + pseudo_type, pseudo_value = pseudo_attributes.groups() if pseudo_type == 'nth-of-type': try: pseudo_value = int(pseudo_value) @@ -1286,109 +1387,110 @@ def nth_child_of_type(self, tag): raise NotImplementedError( 'Only the following pseudo-classes are implemented: nth-of-type.') - elif token == '*': - # Star selector -- matches everything - pass - elif token == '>': - # Run the next token as a CSS selector against the - # direct children of each tag in the current context. - recursive_candidate_generator = lambda tag: tag.children - elif token == '~': - # Run the next token as a CSS selector against the - # siblings of each tag in the current context. - recursive_candidate_generator = lambda tag: tag.next_siblings - elif token == '+': - # For each tag in the current context, run the next - # token as a CSS selector against the tag's next - # sibling that's a tag. - def next_tag_sibling(tag): - yield tag.find_next_sibling(True) - recursive_candidate_generator = next_tag_sibling - - elif self.tag_name_re.match(token): - # Just a tag name. - tag_name = token - else: - raise ValueError( - 'Unsupported or invalid CSS selector: "%s"' % token) - - if recursive_candidate_generator: - # This happens when the selector looks like "> foo". - # - # The generator calls select() recursively on every - # member of the current context, passing in a different - # candidate generator and a different selector. 
- # - # In the case of "> foo", the candidate generator is - # one that yields a tag's direct children (">"), and - # the selector is "foo". - next_token = tokens[index+1] - def recursive_select(tag): - if self._select_debug: - print ' Calling select("%s") recursively on %s %s' % (next_token, tag.name, tag.attrs) - print '-' * 40 - for i in tag.select(next_token, recursive_candidate_generator): + elif token == '*': + # Star selector -- matches everything + pass + elif token == '>': + # Run the next token as a CSS selector against the + # direct children of each tag in the current context. + recursive_candidate_generator = lambda tag: tag.children + elif token == '~': + # Run the next token as a CSS selector against the + # siblings of each tag in the current context. + recursive_candidate_generator = lambda tag: tag.next_siblings + elif token == '+': + # For each tag in the current context, run the next + # token as a CSS selector against the tag's next + # sibling that's a tag. + def next_tag_sibling(tag): + yield tag.find_next_sibling(True) + recursive_candidate_generator = next_tag_sibling + + elif self.tag_name_re.match(token): + # Just a tag name. + tag_name = token + else: + raise ValueError( + 'Unsupported or invalid CSS selector: "%s"' % token) + if recursive_candidate_generator: + # This happens when the selector looks like "> foo". + # + # The generator calls select() recursively on every + # member of the current context, passing in a different + # candidate generator and a different selector. + # + # In the case of "> foo", the candidate generator is + # one that yields a tag's direct children (">"), and + # the selector is "foo". 
+ next_token = tokens[index+1] + def recursive_select(tag): + if self._select_debug: + print ' Calling select("%s") recursively on %s %s' % (next_token, tag.name, tag.attrs) + print '-' * 40 + for i in tag.select(next_token, recursive_candidate_generator): + if self._select_debug: + print '(Recursive select picked up candidate %s %s)' % (i.name, i.attrs) + yield i if self._select_debug: - print '(Recursive select picked up candidate %s %s)' % (i.name, i.attrs) - yield i + print '-' * 40 + _use_candidate_generator = recursive_select + elif _candidate_generator is None: + # By default, a tag's candidates are all of its + # children. If tag_name is defined, only yield tags + # with that name. if self._select_debug: - print '-' * 40 - _use_candidate_generator = recursive_select - elif _candidate_generator is None: - # By default, a tag's candidates are all of its - # children. If tag_name is defined, only yield tags - # with that name. - if self._select_debug: - if tag_name: - check = "[any]" + if tag_name: + check = "[any]" + else: + check = tag_name + print ' Default candidate generator, tag name="%s"' % check + if self._select_debug: + # This is redundant with later code, but it stops + # a bunch of bogus tags from cluttering up the + # debug log. + def default_candidate_generator(tag): + for child in tag.descendants: + if not isinstance(child, Tag): + continue + if tag_name and not child.name == tag_name: + continue + yield child + _use_candidate_generator = default_candidate_generator else: - check = tag_name - print ' Default candidate generator, tag name="%s"' % check - if self._select_debug: - # This is redundant with later code, but it stops - # a bunch of bogus tags from cluttering up the - # debug log. 
- def default_candidate_generator(tag): - for child in tag.descendants: - if not isinstance(child, Tag): - continue - if tag_name and not child.name == tag_name: - continue - yield child - _use_candidate_generator = default_candidate_generator + _use_candidate_generator = lambda tag: tag.descendants else: - _use_candidate_generator = lambda tag: tag.descendants - else: - _use_candidate_generator = _candidate_generator + _use_candidate_generator = _candidate_generator + + count = 0 + for tag in current_context: + if self._select_debug: + print " Running candidate generator on %s %s" % ( + tag.name, repr(tag.attrs)) + for candidate in _use_candidate_generator(tag): + if not isinstance(candidate, Tag): + continue + if tag_name and candidate.name != tag_name: + continue + if checker is not None: + try: + result = checker(candidate) + except StopIteration: + # The checker has decided we should no longer + # run the generator. + break + if checker is None or result: + if self._select_debug: + print " SUCCESS %s %s" % (candidate.name, repr(candidate.attrs)) + if id(candidate) not in new_context_ids: + # If a tag matches a selector more than once, + # don't include it in the context more than once. + new_context.append(candidate) + new_context_ids.add(id(candidate)) + if limit and len(new_context) >= limit: + break + elif self._select_debug: + print " FAILURE %s %s" % (candidate.name, repr(candidate.attrs)) - new_context = [] - new_context_ids = set([]) - for tag in current_context: - if self._select_debug: - print " Running candidate generator on %s %s" % ( - tag.name, repr(tag.attrs)) - for candidate in _use_candidate_generator(tag): - if not isinstance(candidate, Tag): - continue - if tag_name and candidate.name != tag_name: - continue - if checker is not None: - try: - result = checker(candidate) - except StopIteration: - # The checker has decided we should no longer - # run the generator. 
- break - if checker is None or result: - if self._select_debug: - print " SUCCESS %s %s" % (candidate.name, repr(candidate.attrs)) - if id(candidate) not in new_context_ids: - # If a tag matches a selector more than once, - # don't include it in the context more than once. - new_context.append(candidate) - new_context_ids.add(id(candidate)) - elif self._select_debug: - print " FAILURE %s %s" % (candidate.name, repr(candidate.attrs)) current_context = new_context diff --git a/lib/bs4/testing.py b/lib/bs4/testing.py index fd4495ac5..9e5e295ef 100644 --- a/lib/bs4/testing.py +++ b/lib/bs4/testing.py @@ -1,5 +1,6 @@ """Helper classes for tests.""" +import pickle import copy import functools import unittest @@ -43,6 +44,16 @@ def assertSoupEquals(self, to_parse, compare_parsed_to=None): self.assertEqual(obj.decode(), self.document_for(compare_parsed_to)) + def assertConnectedness(self, element): + """Ensure that next_element and previous_element are properly + set for all descendants of the given element. + """ + earlier = None + for e in element.descendants: + if earlier: + self.assertEqual(e, earlier.next_element) + self.assertEqual(earlier, e.previous_element) + earlier = e class HTMLTreeBuilderSmokeTest(object): @@ -54,6 +65,15 @@ class HTMLTreeBuilderSmokeTest(object): markup in these tests, there's not much room for interpretation. """ + def test_pickle_and_unpickle_identity(self): + # Pickling a tree, then unpickling it, yields a tree identical + # to the original. 
+ tree = self.soup("foo") + dumped = pickle.dumps(tree, 2) + loaded = pickle.loads(dumped) + self.assertEqual(loaded.__class__, BeautifulSoup) + self.assertEqual(loaded.decode(), tree.decode()) + def assertDoctypeHandled(self, doctype_fragment): """Assert that a given doctype string is handled correctly.""" doctype_str, soup = self._document_with_doctype(doctype_fragment) @@ -114,6 +134,11 @@ def test_real_xhtml_document(self): soup.encode("utf-8").replace(b"\n", b""), markup.replace(b"\n", b"")) + def test_processing_instruction(self): + markup = b"""""" + soup = self.soup(markup) + self.assertEqual(markup, soup.encode("utf8")) + def test_deepcopy(self): """Make sure you can copy the tree builder. @@ -155,6 +180,23 @@ def test_br_is_always_empty_element_tag(self): def test_nested_formatting_elements(self): self.assertSoupEquals("") + def test_double_head(self): + html = ''' + + +Ordinary HEAD element test + + + +Hello, world! + + +''' + soup = self.soup(html) + self.assertEqual("text/javascript", soup.find('script')['type']) + def test_comment(self): # Comments are represented as Comment objects. markup = "

foobaz

" @@ -221,6 +263,14 @@ def test_deeply_nested_multivalued_attribute(self): soup = self.soup(markup) self.assertEqual(["css"], soup.div.div['class']) + def test_multivalued_attribute_on_html(self): + # html5lib uses a different API to set the attributes ot the + # tag. This has caused problems with multivalued + # attributes. + markup = '' + soup = self.soup(markup) + self.assertEqual(["a", "b"], soup.html['class']) + def test_angle_brackets_in_attribute_values_are_escaped(self): self.assertSoupEquals('', '') @@ -253,6 +303,35 @@ def test_multipart_strings(self): soup = self.soup("

\nfoo

") self.assertEqual("p", soup.h2.string.next_element.name) self.assertEqual("p", soup.p.name) + self.assertConnectedness(soup) + + def test_head_tag_between_head_and_body(self): + "Prevent recurrence of a bug in the html5lib treebuilder." + content = """ + + foo + +""" + soup = self.soup(content) + self.assertNotEqual(None, soup.html.body) + self.assertConnectedness(soup) + + def test_multiple_copies_of_a_tag(self): + "Prevent recurrence of a bug in the html5lib treebuilder." + content = """ + + + + + +""" + soup = self.soup(content) + self.assertConnectedness(soup.article) def test_basic_namespaces(self): """Parsers don't need to *understand* namespaces, but at the @@ -463,6 +542,15 @@ def test_tag_with_no_attributes_can_have_attributes_added(self): class XMLTreeBuilderSmokeTest(object): + def test_pickle_and_unpickle_identity(self): + # Pickling a tree, then unpickling it, yields a tree identical + # to the original. + tree = self.soup("foo") + dumped = pickle.dumps(tree, 2) + loaded = pickle.loads(dumped) + self.assertEqual(loaded.__class__, BeautifulSoup) + self.assertEqual(loaded.decode(), tree.decode()) + def test_docstring_generated(self): soup = self.soup("") self.assertEqual( @@ -485,7 +573,7 @@ def test_formatter_processes_script_tag_for_xml_documents(self): """ - soup = BeautifulSoup(doc, "xml") + soup = BeautifulSoup(doc, "lxml-xml") # lxml would have stripped this while parsing, but we can add # it later. 
soup.script.string = 'console.log("< < hey > > ");' diff --git a/lib/html5lib/__init__.py b/lib/html5lib/__init__.py index 19a4b7d69..3ba1163cf 100644 --- a/lib/html5lib/__init__.py +++ b/lib/html5lib/__init__.py @@ -20,4 +20,6 @@ __all__ = ["HTMLParser", "parse", "parseFragment", "getTreeBuilder", "getTreeWalker", "serialize"] -__version__ = "0.999" + +# this has to be at the top level, see how setup.py parses this +__version__ = "0.999999" diff --git a/lib/html5lib/constants.py b/lib/html5lib/constants.py index e7089846d..d938e0ae6 100644 --- a/lib/html5lib/constants.py +++ b/lib/html5lib/constants.py @@ -1,292 +1,290 @@ from __future__ import absolute_import, division, unicode_literals import string -import gettext -_ = gettext.gettext EOF = None E = { "null-character": - _("Null character in input stream, replaced with U+FFFD."), + "Null character in input stream, replaced with U+FFFD.", "invalid-codepoint": - _("Invalid codepoint in stream."), + "Invalid codepoint in stream.", "incorrectly-placed-solidus": - _("Solidus (/) incorrectly placed in tag."), + "Solidus (/) incorrectly placed in tag.", "incorrect-cr-newline-entity": - _("Incorrect CR newline entity, replaced with LF."), + "Incorrect CR newline entity, replaced with LF.", "illegal-windows-1252-entity": - _("Entity used with illegal number (windows-1252 reference)."), + "Entity used with illegal number (windows-1252 reference).", "cant-convert-numeric-entity": - _("Numeric entity couldn't be converted to character " - "(codepoint U+%(charAsInt)08x)."), + "Numeric entity couldn't be converted to character " + "(codepoint U+%(charAsInt)08x).", "illegal-codepoint-for-numeric-entity": - _("Numeric entity represents an illegal codepoint: " - "U+%(charAsInt)08x."), + "Numeric entity represents an illegal codepoint: " + "U+%(charAsInt)08x.", "numeric-entity-without-semicolon": - _("Numeric entity didn't end with ';'."), + "Numeric entity didn't end with ';'.", "expected-numeric-entity-but-got-eof": - 
_("Numeric entity expected. Got end of file instead."), + "Numeric entity expected. Got end of file instead.", "expected-numeric-entity": - _("Numeric entity expected but none found."), + "Numeric entity expected but none found.", "named-entity-without-semicolon": - _("Named entity didn't end with ';'."), + "Named entity didn't end with ';'.", "expected-named-entity": - _("Named entity expected. Got none."), + "Named entity expected. Got none.", "attributes-in-end-tag": - _("End tag contains unexpected attributes."), + "End tag contains unexpected attributes.", 'self-closing-flag-on-end-tag': - _("End tag contains unexpected self-closing flag."), + "End tag contains unexpected self-closing flag.", "expected-tag-name-but-got-right-bracket": - _("Expected tag name. Got '>' instead."), + "Expected tag name. Got '>' instead.", "expected-tag-name-but-got-question-mark": - _("Expected tag name. Got '?' instead. (HTML doesn't " - "support processing instructions.)"), + "Expected tag name. Got '?' instead. (HTML doesn't " + "support processing instructions.)", "expected-tag-name": - _("Expected tag name. Got something else instead"), + "Expected tag name. Got something else instead", "expected-closing-tag-but-got-right-bracket": - _("Expected closing tag. Got '>' instead. Ignoring ''."), + "Expected closing tag. Got '>' instead. Ignoring ''.", "expected-closing-tag-but-got-eof": - _("Expected closing tag. Unexpected end of file."), + "Expected closing tag. Unexpected end of file.", "expected-closing-tag-but-got-char": - _("Expected closing tag. Unexpected character '%(data)s' found."), + "Expected closing tag. Unexpected character '%(data)s' found.", "eof-in-tag-name": - _("Unexpected end of file in the tag name."), + "Unexpected end of file in the tag name.", "expected-attribute-name-but-got-eof": - _("Unexpected end of file. Expected attribute name instead."), + "Unexpected end of file. 
Expected attribute name instead.", "eof-in-attribute-name": - _("Unexpected end of file in attribute name."), + "Unexpected end of file in attribute name.", "invalid-character-in-attribute-name": - _("Invalid character in attribute name"), + "Invalid character in attribute name", "duplicate-attribute": - _("Dropped duplicate attribute on tag."), + "Dropped duplicate attribute on tag.", "expected-end-of-tag-name-but-got-eof": - _("Unexpected end of file. Expected = or end of tag."), + "Unexpected end of file. Expected = or end of tag.", "expected-attribute-value-but-got-eof": - _("Unexpected end of file. Expected attribute value."), + "Unexpected end of file. Expected attribute value.", "expected-attribute-value-but-got-right-bracket": - _("Expected attribute value. Got '>' instead."), + "Expected attribute value. Got '>' instead.", 'equals-in-unquoted-attribute-value': - _("Unexpected = in unquoted attribute"), + "Unexpected = in unquoted attribute", 'unexpected-character-in-unquoted-attribute-value': - _("Unexpected character in unquoted attribute"), + "Unexpected character in unquoted attribute", "invalid-character-after-attribute-name": - _("Unexpected character after attribute name."), + "Unexpected character after attribute name.", "unexpected-character-after-attribute-value": - _("Unexpected character after attribute value."), + "Unexpected character after attribute value.", "eof-in-attribute-value-double-quote": - _("Unexpected end of file in attribute value (\")."), + "Unexpected end of file in attribute value (\").", "eof-in-attribute-value-single-quote": - _("Unexpected end of file in attribute value (')."), + "Unexpected end of file in attribute value (').", "eof-in-attribute-value-no-quotes": - _("Unexpected end of file in attribute value."), + "Unexpected end of file in attribute value.", "unexpected-EOF-after-solidus-in-tag": - _("Unexpected end of file in tag. Expected >"), + "Unexpected end of file in tag. 
Expected >", "unexpected-character-after-solidus-in-tag": - _("Unexpected character after / in tag. Expected >"), + "Unexpected character after / in tag. Expected >", "expected-dashes-or-doctype": - _("Expected '--' or 'DOCTYPE'. Not found."), + "Expected '--' or 'DOCTYPE'. Not found.", "unexpected-bang-after-double-dash-in-comment": - _("Unexpected ! after -- in comment"), + "Unexpected ! after -- in comment", "unexpected-space-after-double-dash-in-comment": - _("Unexpected space after -- in comment"), + "Unexpected space after -- in comment", "incorrect-comment": - _("Incorrect comment."), + "Incorrect comment.", "eof-in-comment": - _("Unexpected end of file in comment."), + "Unexpected end of file in comment.", "eof-in-comment-end-dash": - _("Unexpected end of file in comment (-)"), + "Unexpected end of file in comment (-)", "unexpected-dash-after-double-dash-in-comment": - _("Unexpected '-' after '--' found in comment."), + "Unexpected '-' after '--' found in comment.", "eof-in-comment-double-dash": - _("Unexpected end of file in comment (--)."), + "Unexpected end of file in comment (--).", "eof-in-comment-end-space-state": - _("Unexpected end of file in comment."), + "Unexpected end of file in comment.", "eof-in-comment-end-bang-state": - _("Unexpected end of file in comment."), + "Unexpected end of file in comment.", "unexpected-char-in-comment": - _("Unexpected character in comment found."), + "Unexpected character in comment found.", "need-space-after-doctype": - _("No space after literal string 'DOCTYPE'."), + "No space after literal string 'DOCTYPE'.", "expected-doctype-name-but-got-right-bracket": - _("Unexpected > character. Expected DOCTYPE name."), + "Unexpected > character. Expected DOCTYPE name.", "expected-doctype-name-but-got-eof": - _("Unexpected end of file. Expected DOCTYPE name."), + "Unexpected end of file. 
Expected DOCTYPE name.", "eof-in-doctype-name": - _("Unexpected end of file in DOCTYPE name."), + "Unexpected end of file in DOCTYPE name.", "eof-in-doctype": - _("Unexpected end of file in DOCTYPE."), + "Unexpected end of file in DOCTYPE.", "expected-space-or-right-bracket-in-doctype": - _("Expected space or '>'. Got '%(data)s'"), + "Expected space or '>'. Got '%(data)s'", "unexpected-end-of-doctype": - _("Unexpected end of DOCTYPE."), + "Unexpected end of DOCTYPE.", "unexpected-char-in-doctype": - _("Unexpected character in DOCTYPE."), + "Unexpected character in DOCTYPE.", "eof-in-innerhtml": - _("XXX innerHTML EOF"), + "XXX innerHTML EOF", "unexpected-doctype": - _("Unexpected DOCTYPE. Ignored."), + "Unexpected DOCTYPE. Ignored.", "non-html-root": - _("html needs to be the first start tag."), + "html needs to be the first start tag.", "expected-doctype-but-got-eof": - _("Unexpected End of file. Expected DOCTYPE."), + "Unexpected End of file. Expected DOCTYPE.", "unknown-doctype": - _("Erroneous DOCTYPE."), + "Erroneous DOCTYPE.", "expected-doctype-but-got-chars": - _("Unexpected non-space characters. Expected DOCTYPE."), + "Unexpected non-space characters. Expected DOCTYPE.", "expected-doctype-but-got-start-tag": - _("Unexpected start tag (%(name)s). Expected DOCTYPE."), + "Unexpected start tag (%(name)s). Expected DOCTYPE.", "expected-doctype-but-got-end-tag": - _("Unexpected end tag (%(name)s). Expected DOCTYPE."), + "Unexpected end tag (%(name)s). Expected DOCTYPE.", "end-tag-after-implied-root": - _("Unexpected end tag (%(name)s) after the (implied) root element."), + "Unexpected end tag (%(name)s) after the (implied) root element.", "expected-named-closing-tag-but-got-eof": - _("Unexpected end of file. Expected end tag (%(name)s)."), + "Unexpected end of file. Expected end tag (%(name)s).", "two-heads-are-not-better-than-one": - _("Unexpected start tag head in existing head. Ignored."), + "Unexpected start tag head in existing head. 
Ignored.", "unexpected-end-tag": - _("Unexpected end tag (%(name)s). Ignored."), + "Unexpected end tag (%(name)s). Ignored.", "unexpected-start-tag-out-of-my-head": - _("Unexpected start tag (%(name)s) that can be in head. Moved."), + "Unexpected start tag (%(name)s) that can be in head. Moved.", "unexpected-start-tag": - _("Unexpected start tag (%(name)s)."), + "Unexpected start tag (%(name)s).", "missing-end-tag": - _("Missing end tag (%(name)s)."), + "Missing end tag (%(name)s).", "missing-end-tags": - _("Missing end tags (%(name)s)."), + "Missing end tags (%(name)s).", "unexpected-start-tag-implies-end-tag": - _("Unexpected start tag (%(startName)s) " - "implies end tag (%(endName)s)."), + "Unexpected start tag (%(startName)s) " + "implies end tag (%(endName)s).", "unexpected-start-tag-treated-as": - _("Unexpected start tag (%(originalName)s). Treated as %(newName)s."), + "Unexpected start tag (%(originalName)s). Treated as %(newName)s.", "deprecated-tag": - _("Unexpected start tag %(name)s. Don't use it!"), + "Unexpected start tag %(name)s. Don't use it!", "unexpected-start-tag-ignored": - _("Unexpected start tag %(name)s. Ignored."), + "Unexpected start tag %(name)s. Ignored.", "expected-one-end-tag-but-got-another": - _("Unexpected end tag (%(gotName)s). " - "Missing end tag (%(expectedName)s)."), + "Unexpected end tag (%(gotName)s). " + "Missing end tag (%(expectedName)s).", "end-tag-too-early": - _("End tag (%(name)s) seen too early. Expected other end tag."), + "End tag (%(name)s) seen too early. Expected other end tag.", "end-tag-too-early-named": - _("Unexpected end tag (%(gotName)s). Expected end tag (%(expectedName)s)."), + "Unexpected end tag (%(gotName)s). Expected end tag (%(expectedName)s).", "end-tag-too-early-ignored": - _("End tag (%(name)s) seen too early. Ignored."), + "End tag (%(name)s) seen too early. 
Ignored.", "adoption-agency-1.1": - _("End tag (%(name)s) violates step 1, " - "paragraph 1 of the adoption agency algorithm."), + "End tag (%(name)s) violates step 1, " + "paragraph 1 of the adoption agency algorithm.", "adoption-agency-1.2": - _("End tag (%(name)s) violates step 1, " - "paragraph 2 of the adoption agency algorithm."), + "End tag (%(name)s) violates step 1, " + "paragraph 2 of the adoption agency algorithm.", "adoption-agency-1.3": - _("End tag (%(name)s) violates step 1, " - "paragraph 3 of the adoption agency algorithm."), + "End tag (%(name)s) violates step 1, " + "paragraph 3 of the adoption agency algorithm.", "adoption-agency-4.4": - _("End tag (%(name)s) violates step 4, " - "paragraph 4 of the adoption agency algorithm."), + "End tag (%(name)s) violates step 4, " + "paragraph 4 of the adoption agency algorithm.", "unexpected-end-tag-treated-as": - _("Unexpected end tag (%(originalName)s). Treated as %(newName)s."), + "Unexpected end tag (%(originalName)s). Treated as %(newName)s.", "no-end-tag": - _("This element (%(name)s) has no end tag."), + "This element (%(name)s) has no end tag.", "unexpected-implied-end-tag-in-table": - _("Unexpected implied end tag (%(name)s) in the table phase."), + "Unexpected implied end tag (%(name)s) in the table phase.", "unexpected-implied-end-tag-in-table-body": - _("Unexpected implied end tag (%(name)s) in the table body phase."), + "Unexpected implied end tag (%(name)s) in the table body phase.", "unexpected-char-implies-table-voodoo": - _("Unexpected non-space characters in " - "table context caused voodoo mode."), + "Unexpected non-space characters in " + "table context caused voodoo mode.", "unexpected-hidden-input-in-table": - _("Unexpected input with type hidden in table context."), + "Unexpected input with type hidden in table context.", "unexpected-form-in-table": - _("Unexpected form in table context."), + "Unexpected form in table context.", "unexpected-start-tag-implies-table-voodoo": - 
_("Unexpected start tag (%(name)s) in " - "table context caused voodoo mode."), + "Unexpected start tag (%(name)s) in " + "table context caused voodoo mode.", "unexpected-end-tag-implies-table-voodoo": - _("Unexpected end tag (%(name)s) in " - "table context caused voodoo mode."), + "Unexpected end tag (%(name)s) in " + "table context caused voodoo mode.", "unexpected-cell-in-table-body": - _("Unexpected table cell start tag (%(name)s) " - "in the table body phase."), + "Unexpected table cell start tag (%(name)s) " + "in the table body phase.", "unexpected-cell-end-tag": - _("Got table cell end tag (%(name)s) " - "while required end tags are missing."), + "Got table cell end tag (%(name)s) " + "while required end tags are missing.", "unexpected-end-tag-in-table-body": - _("Unexpected end tag (%(name)s) in the table body phase. Ignored."), + "Unexpected end tag (%(name)s) in the table body phase. Ignored.", "unexpected-implied-end-tag-in-table-row": - _("Unexpected implied end tag (%(name)s) in the table row phase."), + "Unexpected implied end tag (%(name)s) in the table row phase.", "unexpected-end-tag-in-table-row": - _("Unexpected end tag (%(name)s) in the table row phase. Ignored."), + "Unexpected end tag (%(name)s) in the table row phase. Ignored.", "unexpected-select-in-select": - _("Unexpected select start tag in the select phase " - "treated as select end tag."), + "Unexpected select start tag in the select phase " + "treated as select end tag.", "unexpected-input-in-select": - _("Unexpected input start tag in the select phase."), + "Unexpected input start tag in the select phase.", "unexpected-start-tag-in-select": - _("Unexpected start tag token (%(name)s in the select phase. " - "Ignored."), + "Unexpected start tag token (%(name)s in the select phase. " + "Ignored.", "unexpected-end-tag-in-select": - _("Unexpected end tag (%(name)s) in the select phase. Ignored."), + "Unexpected end tag (%(name)s) in the select phase. 
Ignored.", "unexpected-table-element-start-tag-in-select-in-table": - _("Unexpected table element start tag (%(name)s) in the select in table phase."), + "Unexpected table element start tag (%(name)s) in the select in table phase.", "unexpected-table-element-end-tag-in-select-in-table": - _("Unexpected table element end tag (%(name)s) in the select in table phase."), + "Unexpected table element end tag (%(name)s) in the select in table phase.", "unexpected-char-after-body": - _("Unexpected non-space characters in the after body phase."), + "Unexpected non-space characters in the after body phase.", "unexpected-start-tag-after-body": - _("Unexpected start tag token (%(name)s)" - " in the after body phase."), + "Unexpected start tag token (%(name)s)" + " in the after body phase.", "unexpected-end-tag-after-body": - _("Unexpected end tag token (%(name)s)" - " in the after body phase."), + "Unexpected end tag token (%(name)s)" + " in the after body phase.", "unexpected-char-in-frameset": - _("Unexpected characters in the frameset phase. Characters ignored."), + "Unexpected characters in the frameset phase. Characters ignored.", "unexpected-start-tag-in-frameset": - _("Unexpected start tag token (%(name)s)" - " in the frameset phase. Ignored."), + "Unexpected start tag token (%(name)s)" + " in the frameset phase. Ignored.", "unexpected-frameset-in-frameset-innerhtml": - _("Unexpected end tag token (frameset) " - "in the frameset phase (innerHTML)."), + "Unexpected end tag token (frameset) " + "in the frameset phase (innerHTML).", "unexpected-end-tag-in-frameset": - _("Unexpected end tag token (%(name)s)" - " in the frameset phase. Ignored."), + "Unexpected end tag token (%(name)s)" + " in the frameset phase. Ignored.", "unexpected-char-after-frameset": - _("Unexpected non-space characters in the " - "after frameset phase. Ignored."), + "Unexpected non-space characters in the " + "after frameset phase. 
Ignored.", "unexpected-start-tag-after-frameset": - _("Unexpected start tag (%(name)s)" - " in the after frameset phase. Ignored."), + "Unexpected start tag (%(name)s)" + " in the after frameset phase. Ignored.", "unexpected-end-tag-after-frameset": - _("Unexpected end tag (%(name)s)" - " in the after frameset phase. Ignored."), + "Unexpected end tag (%(name)s)" + " in the after frameset phase. Ignored.", "unexpected-end-tag-after-body-innerhtml": - _("Unexpected end tag after body(innerHtml)"), + "Unexpected end tag after body(innerHtml)", "expected-eof-but-got-char": - _("Unexpected non-space characters. Expected end of file."), + "Unexpected non-space characters. Expected end of file.", "expected-eof-but-got-start-tag": - _("Unexpected start tag (%(name)s)" - ". Expected end of file."), + "Unexpected start tag (%(name)s)" + ". Expected end of file.", "expected-eof-but-got-end-tag": - _("Unexpected end tag (%(name)s)" - ". Expected end of file."), + "Unexpected end tag (%(name)s)" + ". Expected end of file.", "eof-in-table": - _("Unexpected end of file. Expected table content."), + "Unexpected end of file. Expected table content.", "eof-in-select": - _("Unexpected end of file. Expected select content."), + "Unexpected end of file. Expected select content.", "eof-in-frameset": - _("Unexpected end of file. Expected frameset content."), + "Unexpected end of file. Expected frameset content.", "eof-in-script-in-script": - _("Unexpected end of file. Expected script content."), + "Unexpected end of file. Expected script content.", "eof-in-foreign-lands": - _("Unexpected end of file. Expected foreign content"), + "Unexpected end of file. 
Expected foreign content", "non-void-element-with-trailing-solidus": - _("Trailing solidus not allowed on element %(name)s"), + "Trailing solidus not allowed on element %(name)s", "unexpected-html-element-in-foreign-content": - _("Element %(name)s not allowed in a non-html context"), + "Element %(name)s not allowed in a non-html context", "unexpected-end-tag-before-html": - _("Unexpected end tag (%(name)s) before html."), + "Unexpected end tag (%(name)s) before html.", "XXX-undefined-error": - _("Undefined error (this sucks and should be fixed)"), + "Undefined error (this sucks and should be fixed)", } namespaces = { @@ -298,7 +296,7 @@ "xmlns": "http://www.w3.org/2000/xmlns/" } -scopingElements = frozenset(( +scopingElements = frozenset([ (namespaces["html"], "applet"), (namespaces["html"], "caption"), (namespaces["html"], "html"), @@ -316,9 +314,9 @@ (namespaces["svg"], "foreignObject"), (namespaces["svg"], "desc"), (namespaces["svg"], "title"), -)) +]) -formattingElements = frozenset(( +formattingElements = frozenset([ (namespaces["html"], "a"), (namespaces["html"], "b"), (namespaces["html"], "big"), @@ -333,9 +331,9 @@ (namespaces["html"], "strong"), (namespaces["html"], "tt"), (namespaces["html"], "u") -)) +]) -specialElements = frozenset(( +specialElements = frozenset([ (namespaces["html"], "address"), (namespaces["html"], "applet"), (namespaces["html"], "area"), @@ -416,22 +414,22 @@ (namespaces["html"], "wbr"), (namespaces["html"], "xmp"), (namespaces["svg"], "foreignObject") -)) +]) -htmlIntegrationPointElements = frozenset(( +htmlIntegrationPointElements = frozenset([ (namespaces["mathml"], "annotaion-xml"), (namespaces["svg"], "foreignObject"), (namespaces["svg"], "desc"), (namespaces["svg"], "title") -)) +]) -mathmlTextIntegrationPointElements = frozenset(( +mathmlTextIntegrationPointElements = frozenset([ (namespaces["mathml"], "mi"), (namespaces["mathml"], "mo"), (namespaces["mathml"], "mn"), (namespaces["mathml"], "ms"), (namespaces["mathml"], 
"mtext") -)) +]) adjustForeignAttributes = { "xlink:actuate": ("xlink", "actuate", namespaces["xlink"]), @@ -451,21 +449,21 @@ unadjustForeignAttributes = dict([((ns, local), qname) for qname, (prefix, local, ns) in adjustForeignAttributes.items()]) -spaceCharacters = frozenset(( +spaceCharacters = frozenset([ "\t", "\n", "\u000C", " ", "\r" -)) +]) -tableInsertModeElements = frozenset(( +tableInsertModeElements = frozenset([ "table", "tbody", "tfoot", "thead", "tr" -)) +]) asciiLowercase = frozenset(string.ascii_lowercase) asciiUppercase = frozenset(string.ascii_uppercase) @@ -486,7 +484,7 @@ "h6" ) -voidElements = frozenset(( +voidElements = frozenset([ "base", "command", "event-source", @@ -502,11 +500,11 @@ "input", "source", "track" -)) +]) -cdataElements = frozenset(('title', 'textarea')) +cdataElements = frozenset(['title', 'textarea']) -rcdataElements = frozenset(( +rcdataElements = frozenset([ 'style', 'script', 'xmp', @@ -514,27 +512,27 @@ 'noembed', 'noframes', 'noscript' -)) +]) booleanAttributes = { - "": frozenset(("irrelevant",)), - "style": frozenset(("scoped",)), - "img": frozenset(("ismap",)), - "audio": frozenset(("autoplay", "controls")), - "video": frozenset(("autoplay", "controls")), - "script": frozenset(("defer", "async")), - "details": frozenset(("open",)), - "datagrid": frozenset(("multiple", "disabled")), - "command": frozenset(("hidden", "disabled", "checked", "default")), - "hr": frozenset(("noshade")), - "menu": frozenset(("autosubmit",)), - "fieldset": frozenset(("disabled", "readonly")), - "option": frozenset(("disabled", "readonly", "selected")), - "optgroup": frozenset(("disabled", "readonly")), - "button": frozenset(("disabled", "autofocus")), - "input": frozenset(("disabled", "readonly", "required", "autofocus", "checked", "ismap")), - "select": frozenset(("disabled", "readonly", "autofocus", "multiple")), - "output": frozenset(("disabled", "readonly")), + "": frozenset(["irrelevant"]), + "style": frozenset(["scoped"]), + "img": 
frozenset(["ismap"]), + "audio": frozenset(["autoplay", "controls"]), + "video": frozenset(["autoplay", "controls"]), + "script": frozenset(["defer", "async"]), + "details": frozenset(["open"]), + "datagrid": frozenset(["multiple", "disabled"]), + "command": frozenset(["hidden", "disabled", "checked", "default"]), + "hr": frozenset(["noshade"]), + "menu": frozenset(["autosubmit"]), + "fieldset": frozenset(["disabled", "readonly"]), + "option": frozenset(["disabled", "readonly", "selected"]), + "optgroup": frozenset(["disabled", "readonly"]), + "button": frozenset(["disabled", "autofocus"]), + "input": frozenset(["disabled", "readonly", "required", "autofocus", "checked", "ismap"]), + "select": frozenset(["disabled", "readonly", "autofocus", "multiple"]), + "output": frozenset(["disabled", "readonly"]), } # entitiesWindows1252 has to be _ordered_ and needs to have an index. It @@ -574,7 +572,7 @@ 376 # 0x9F 0x0178 LATIN CAPITAL LETTER Y WITH DIAERESIS ) -xmlEntities = frozenset(('lt;', 'gt;', 'amp;', 'apos;', 'quot;')) +xmlEntities = frozenset(['lt;', 'gt;', 'amp;', 'apos;', 'quot;']) entities = { "AElig": "\xc6", @@ -3088,8 +3086,8 @@ "ParseError": 7 } -tagTokenTypes = frozenset((tokenTypes["StartTag"], tokenTypes["EndTag"], - tokenTypes["EmptyTag"])) +tagTokenTypes = frozenset([tokenTypes["StartTag"], tokenTypes["EndTag"], + tokenTypes["EmptyTag"]]) prefixes = dict([(v, k) for k, v in namespaces.items()]) diff --git a/lib/html5lib/filters/lint.py b/lib/html5lib/filters/lint.py index 7cc99a4ba..8884696dc 100644 --- a/lib/html5lib/filters/lint.py +++ b/lib/html5lib/filters/lint.py @@ -1,8 +1,5 @@ from __future__ import absolute_import, division, unicode_literals -from gettext import gettext -_ = gettext - from . 
import _base from ..constants import cdataElements, rcdataElements, voidElements @@ -23,24 +20,24 @@ def __iter__(self): if type in ("StartTag", "EmptyTag"): name = token["name"] if contentModelFlag != "PCDATA": - raise LintError(_("StartTag not in PCDATA content model flag: %(tag)s") % {"tag": name}) + raise LintError("StartTag not in PCDATA content model flag: %(tag)s" % {"tag": name}) if not isinstance(name, str): - raise LintError(_("Tag name is not a string: %(tag)r") % {"tag": name}) + raise LintError("Tag name is not a string: %(tag)r" % {"tag": name}) if not name: - raise LintError(_("Empty tag name")) + raise LintError("Empty tag name") if type == "StartTag" and name in voidElements: - raise LintError(_("Void element reported as StartTag token: %(tag)s") % {"tag": name}) + raise LintError("Void element reported as StartTag token: %(tag)s" % {"tag": name}) elif type == "EmptyTag" and name not in voidElements: - raise LintError(_("Non-void element reported as EmptyTag token: %(tag)s") % {"tag": token["name"]}) + raise LintError("Non-void element reported as EmptyTag token: %(tag)s" % {"tag": token["name"]}) if type == "StartTag": open_elements.append(name) for name, value in token["data"]: if not isinstance(name, str): - raise LintError(_("Attribute name is not a string: %(name)r") % {"name": name}) + raise LintError("Attribute name is not a string: %(name)r" % {"name": name}) if not name: - raise LintError(_("Empty attribute name")) + raise LintError("Empty attribute name") if not isinstance(value, str): - raise LintError(_("Attribute value is not a string: %(value)r") % {"value": value}) + raise LintError("Attribute value is not a string: %(value)r" % {"value": value}) if name in cdataElements: contentModelFlag = "CDATA" elif name in rcdataElements: @@ -51,43 +48,43 @@ def __iter__(self): elif type == "EndTag": name = token["name"] if not isinstance(name, str): - raise LintError(_("Tag name is not a string: %(tag)r") % {"tag": name}) + raise LintError("Tag 
name is not a string: %(tag)r" % {"tag": name}) if not name: - raise LintError(_("Empty tag name")) + raise LintError("Empty tag name") if name in voidElements: - raise LintError(_("Void element reported as EndTag token: %(tag)s") % {"tag": name}) + raise LintError("Void element reported as EndTag token: %(tag)s" % {"tag": name}) start_name = open_elements.pop() if start_name != name: - raise LintError(_("EndTag (%(end)s) does not match StartTag (%(start)s)") % {"end": name, "start": start_name}) + raise LintError("EndTag (%(end)s) does not match StartTag (%(start)s)" % {"end": name, "start": start_name}) contentModelFlag = "PCDATA" elif type == "Comment": if contentModelFlag != "PCDATA": - raise LintError(_("Comment not in PCDATA content model flag")) + raise LintError("Comment not in PCDATA content model flag") elif type in ("Characters", "SpaceCharacters"): data = token["data"] if not isinstance(data, str): - raise LintError(_("Attribute name is not a string: %(name)r") % {"name": data}) + raise LintError("Attribute name is not a string: %(name)r" % {"name": data}) if not data: - raise LintError(_("%(type)s token with empty data") % {"type": type}) + raise LintError("%(type)s token with empty data" % {"type": type}) if type == "SpaceCharacters": data = data.strip(spaceCharacters) if data: - raise LintError(_("Non-space character(s) found in SpaceCharacters token: %(token)r") % {"token": data}) + raise LintError("Non-space character(s) found in SpaceCharacters token: %(token)r" % {"token": data}) elif type == "Doctype": name = token["name"] if contentModelFlag != "PCDATA": - raise LintError(_("Doctype not in PCDATA content model flag: %(name)s") % {"name": name}) + raise LintError("Doctype not in PCDATA content model flag: %(name)s" % {"name": name}) if not isinstance(name, str): - raise LintError(_("Tag name is not a string: %(tag)r") % {"tag": name}) + raise LintError("Tag name is not a string: %(tag)r" % {"tag": name}) # XXX: what to do with token["data"] ? 
elif type in ("ParseError", "SerializeError"): pass else: - raise LintError(_("Unknown token type: %(type)s") % {"type": type}) + raise LintError("Unknown token type: %(type)s" % {"type": type}) yield token diff --git a/lib/html5lib/html5parser.py b/lib/html5lib/html5parser.py index b0f14f393..12aa6a35e 100644 --- a/lib/html5lib/html5parser.py +++ b/lib/html5lib/html5parser.py @@ -18,6 +18,7 @@ from .constants import tokenTypes, ReparseException, namespaces from .constants import htmlIntegrationPointElements, mathmlTextIntegrationPointElements from .constants import adjustForeignAttributes as adjustForeignAttributesMap +from .constants import E def parse(doc, treebuilder="etree", encoding=None, @@ -129,6 +130,17 @@ def reset(self): self.framesetOK = True + @property + def documentEncoding(self): + """The name of the character encoding + that was used to decode the input stream, + or :obj:`None` if that is not determined yet. + + """ + if not hasattr(self, 'tokenizer'): + return None + return self.tokenizer.stream.charEncoding[0] + def isHTMLIntegrationPoint(self, element): if (element.name == "annotation-xml" and element.namespace == namespaces["mathml"]): @@ -245,7 +257,7 @@ def parseError(self, errorcode="XXX-undefined-error", datavars={}): # XXX The idea is to make errorcode mandatory. 
self.errors.append((self.tokenizer.stream.position(), errorcode, datavars)) if self.strict: - raise ParseError + raise ParseError(E[errorcode] % datavars) def normalizeToken(self, token): """ HTML5 specific normalizations to the token stream """ @@ -868,7 +880,7 @@ def __init__(self, parser, tree): self.startTagHandler = utils.MethodDispatcher([ ("html", self.startTagHtml), (("base", "basefont", "bgsound", "command", "link", "meta", - "noframes", "script", "style", "title"), + "script", "style", "title"), self.startTagProcessInHead), ("body", self.startTagBody), ("frameset", self.startTagFrameset), @@ -1205,8 +1217,7 @@ def startTagIsIndex(self, token): attributes["name"] = "isindex" self.processStartTag(impliedTagToken("input", "StartTag", attributes=attributes, - selfClosing= - token["selfClosing"])) + selfClosing=token["selfClosing"])) self.processEndTag(impliedTagToken("label")) self.processStartTag(impliedTagToken("hr", "StartTag")) self.processEndTag(impliedTagToken("form")) diff --git a/lib/html5lib/inputstream.py b/lib/html5lib/inputstream.py index 9e03b9313..7020aa60f 100644 --- a/lib/html5lib/inputstream.py +++ b/lib/html5lib/inputstream.py @@ -28,7 +28,18 @@ class BufferedIOBase(object): asciiUppercaseBytes = frozenset([item.encode("ascii") for item in asciiUppercase]) spacesAngleBrackets = spaceCharactersBytes | frozenset([b">", b"<"]) -invalid_unicode_re = re.compile("[\u0001-\u0008\u000B\u000E-\u001F\u007F-\u009F\uD800-\uDFFF\uFDD0-\uFDEF\uFFFE\uFFFF\U0001FFFE\U0001FFFF\U0002FFFE\U0002FFFF\U0003FFFE\U0003FFFF\U0004FFFE\U0004FFFF\U0005FFFE\U0005FFFF\U0006FFFE\U0006FFFF\U0007FFFE\U0007FFFF\U0008FFFE\U0008FFFF\U0009FFFE\U0009FFFF\U000AFFFE\U000AFFFF\U000BFFFE\U000BFFFF\U000CFFFE\U000CFFFF\U000DFFFE\U000DFFFF\U000EFFFE\U000EFFFF\U000FFFFE\U000FFFFF\U0010FFFE\U0010FFFF]") + +invalid_unicode_no_surrogate = 
"[\u0001-\u0008\u000B\u000E-\u001F\u007F-\u009F\uFDD0-\uFDEF\uFFFE\uFFFF\U0001FFFE\U0001FFFF\U0002FFFE\U0002FFFF\U0003FFFE\U0003FFFF\U0004FFFE\U0004FFFF\U0005FFFE\U0005FFFF\U0006FFFE\U0006FFFF\U0007FFFE\U0007FFFF\U0008FFFE\U0008FFFF\U0009FFFE\U0009FFFF\U000AFFFE\U000AFFFF\U000BFFFE\U000BFFFF\U000CFFFE\U000CFFFF\U000DFFFE\U000DFFFF\U000EFFFE\U000EFFFF\U000FFFFE\U000FFFFF\U0010FFFE\U0010FFFF]" + +if utils.supports_lone_surrogates: + # Use one extra step of indirection and create surrogates with + # unichr. Not using this indirection would introduce an illegal + # unicode literal on platforms not supporting such lone + # surrogates. + invalid_unicode_re = re.compile(invalid_unicode_no_surrogate + + eval('"\\uD800-\\uDFFF"')) +else: + invalid_unicode_re = re.compile(invalid_unicode_no_surrogate) non_bmp_invalid_codepoints = set([0x1FFFE, 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE, 0x3FFFF, 0x4FFFE, 0x4FFFF, 0x5FFFE, 0x5FFFF, @@ -164,13 +175,18 @@ def __init__(self, source): """ - # Craziness - if len("\U0010FFFF") == 1: + if not utils.supports_lone_surrogates: + # Such platforms will have already checked for such + # surrogate errors, so no need to do this checking. + self.reportCharacterErrors = None + self.replaceCharactersRegexp = None + elif len("\U0010FFFF") == 1: self.reportCharacterErrors = self.characterErrorsUCS4 - self.replaceCharactersRegexp = re.compile("[\uD800-\uDFFF]") + self.replaceCharactersRegexp = re.compile(eval('"[\\uD800-\\uDFFF]"')) else: self.reportCharacterErrors = self.characterErrorsUCS2 - self.replaceCharactersRegexp = re.compile("([\uD800-\uDBFF](?![\uDC00-\uDFFF])|(?/ + (?P[-a-zA-Z0-9.]+/[-a-zA-Z0-9.]+) + # Match any character set and encoding + (?:(?:;charset=(?:[-a-zA-Z0-9]+)(?:;(?:base64))?) + |(?:;(?:base64))?(?:;charset=(?:[-a-zA-Z0-9]+))?) 
+ # Assume the rest is data + ,.* + $ + ''', + re.VERBOSE) + + class HTMLSanitizerMixin(object): """ sanitization of XHTML+MathML+SVG and of inline style attributes.""" @@ -100,8 +115,8 @@ class HTMLSanitizerMixin(object): 'xml:base', 'xml:lang', 'xml:space', 'xmlns', 'xmlns:xlink', 'y', 'y1', 'y2', 'zoomAndPan'] - attr_val_is_uri = ['href', 'src', 'cite', 'action', 'longdesc', 'poster', - 'xlink:href', 'xml:base'] + attr_val_is_uri = ['href', 'src', 'cite', 'action', 'longdesc', 'poster', 'background', 'datasrc', + 'dynsrc', 'lowsrc', 'ping', 'poster', 'xlink:href', 'xml:base'] svg_attr_val_allows_ref = ['clip-path', 'color-profile', 'cursor', 'fill', 'filter', 'marker', 'marker-start', 'marker-mid', 'marker-end', @@ -138,7 +153,9 @@ class HTMLSanitizerMixin(object): acceptable_protocols = ['ed2k', 'ftp', 'http', 'https', 'irc', 'mailto', 'news', 'gopher', 'nntp', 'telnet', 'webcal', 'xmpp', 'callto', 'feed', 'urn', 'aim', 'rsync', 'tag', - 'ssh', 'sftp', 'rtsp', 'afs'] + 'ssh', 'sftp', 'rtsp', 'afs', 'data'] + + acceptable_content_types = ['image/png', 'image/jpeg', 'image/gif', 'image/webp', 'image/bmp', 'text/plain'] # subclasses may define their own versions of these constants allowed_elements = acceptable_elements + mathml_elements + svg_elements @@ -147,6 +164,7 @@ class HTMLSanitizerMixin(object): allowed_css_keywords = acceptable_css_keywords allowed_svg_properties = acceptable_svg_properties allowed_protocols = acceptable_protocols + allowed_content_types = acceptable_content_types # Sanitize the +html+, escaping all elements not in ALLOWED_ELEMENTS, and # stripping out all # attributes not in ALLOWED_ATTRIBUTES. 
Style @@ -189,10 +207,17 @@ def allowed_token(self, token, token_type): unescape(attrs[attr])).lower() # remove replacement characters from unescaped characters val_unescaped = val_unescaped.replace("\ufffd", "") - if (re.match("^[a-z0-9][-+.a-z0-9]*:", val_unescaped) and - (val_unescaped.split(':')[0] not in - self.allowed_protocols)): - del attrs[attr] + uri = urlparse.urlparse(val_unescaped) + if uri and uri.scheme: + if uri.scheme not in self.allowed_protocols: + del attrs[attr] + if uri.scheme == 'data': + m = content_type_rgx.match(uri.path) + if not m: + del attrs[attr] + elif m.group('content_type') not in self.allowed_content_types: + del attrs[attr] + for attr in self.svg_attr_val_allows_ref: if attr in attrs: attrs[attr] = re.sub(r'url\s*\(\s*[^#\s][^)]+?\)', @@ -245,7 +270,7 @@ def sanitize_css(self, style): elif prop.split('-')[0].lower() in ['background', 'border', 'margin', 'padding']: for keyword in value.split(): - if not keyword in self.acceptable_css_keywords and \ + if keyword not in self.acceptable_css_keywords and \ not re.match("^(#[0-9a-f]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)$", keyword): break else: diff --git a/lib/html5lib/serializer/htmlserializer.py b/lib/html5lib/serializer/htmlserializer.py index 412a5a220..be4d63441 100644 --- a/lib/html5lib/serializer/htmlserializer.py +++ b/lib/html5lib/serializer/htmlserializer.py @@ -1,9 +1,6 @@ from __future__ import absolute_import, division, unicode_literals from six import text_type -import gettext -_ = gettext.gettext - try: from functools import reduce except ImportError: @@ -35,7 +32,7 @@ v = utils.surrogatePairToCodepoint(v) else: v = ord(v) - if not v in encode_entity_map or k.islower(): + if v not in encode_entity_map or k.islower(): # prefer < over < and similarly for &, >, etc. 
encode_entity_map[v] = k @@ -208,7 +205,7 @@ def serialize(self, treewalker, encoding=None): if token["systemId"]: if token["systemId"].find('"') >= 0: if token["systemId"].find("'") >= 0: - self.serializeError(_("System identifer contains both single and double quote characters")) + self.serializeError("System identifer contains both single and double quote characters") quote_char = "'" else: quote_char = '"' @@ -220,7 +217,7 @@ def serialize(self, treewalker, encoding=None): elif type in ("Characters", "SpaceCharacters"): if type == "SpaceCharacters" or in_cdata: if in_cdata and token["data"].find("= 0: - self.serializeError(_("Unexpected " % name) elif type == "Comment": data = token["data"] if data.find("--") >= 0: - self.serializeError(_("Comment contains --")) + self.serializeError("Comment contains --") yield self.encodeStrict("" % token["data"]) elif type == "Entity": name = token["name"] key = name + ";" - if not key in entities: - self.serializeError(_("Entity %s not recognized" % name)) + if key not in entities: + self.serializeError("Entity %s not recognized" % name) if self.resolve_entities and key not in xmlEntities: data = entities[key] else: diff --git a/lib/html5lib/treebuilders/dom.py b/lib/html5lib/treebuilders/dom.py index 61e5ed79e..234233b79 100644 --- a/lib/html5lib/treebuilders/dom.py +++ b/lib/html5lib/treebuilders/dom.py @@ -158,7 +158,7 @@ def insertText(self, data, parent=None): else: # HACK: allow text nodes as children of the document node if hasattr(self.dom, '_child_node_types'): - if not Node.TEXT_NODE in self.dom._child_node_types: + if Node.TEXT_NODE not in self.dom._child_node_types: self.dom._child_node_types = list(self.dom._child_node_types) self.dom._child_node_types.append(Node.TEXT_NODE) self.dom.appendChild(self.dom.createTextNode(data)) diff --git a/lib/html5lib/treewalkers/__init__.py b/lib/html5lib/treewalkers/__init__.py index 18124e75f..20b91b114 100644 --- a/lib/html5lib/treewalkers/__init__.py +++ 
b/lib/html5lib/treewalkers/__init__.py @@ -10,8 +10,12 @@ from __future__ import absolute_import, division, unicode_literals +__all__ = ["getTreeWalker", "pprint", "dom", "etree", "genshistream", "lxmletree", + "pulldom"] + import sys +from .. import constants from ..utils import default_etree treeWalkerCache = {} @@ -55,3 +59,89 @@ def getTreeWalker(treeType, implementation=None, **kwargs): # XXX: NEVER cache here, caching is done in the etree submodule return etree.getETreeModule(implementation, **kwargs).TreeWalker return treeWalkerCache.get(treeType) + + +def concatenateCharacterTokens(tokens): + pendingCharacters = [] + for token in tokens: + type = token["type"] + if type in ("Characters", "SpaceCharacters"): + pendingCharacters.append(token["data"]) + else: + if pendingCharacters: + yield {"type": "Characters", "data": "".join(pendingCharacters)} + pendingCharacters = [] + yield token + if pendingCharacters: + yield {"type": "Characters", "data": "".join(pendingCharacters)} + + +def pprint(walker): + """Pretty printer for tree walkers""" + output = [] + indent = 0 + for token in concatenateCharacterTokens(walker): + type = token["type"] + if type in ("StartTag", "EmptyTag"): + # tag name + if token["namespace"] and token["namespace"] != constants.namespaces["html"]: + if token["namespace"] in constants.prefixes: + ns = constants.prefixes[token["namespace"]] + else: + ns = token["namespace"] + name = "%s %s" % (ns, token["name"]) + else: + name = token["name"] + output.append("%s<%s>" % (" " * indent, name)) + indent += 2 + # attributes (sorted for consistent ordering) + attrs = token["data"] + for (namespace, localname), value in sorted(attrs.items()): + if namespace: + if namespace in constants.prefixes: + ns = constants.prefixes[namespace] + else: + ns = namespace + name = "%s %s" % (ns, localname) + else: + name = localname + output.append("%s%s=\"%s\"" % (" " * indent, name, value)) + # self-closing + if type == "EmptyTag": + indent -= 2 + + elif type == 
"EndTag": + indent -= 2 + + elif type == "Comment": + output.append("%s" % (" " * indent, token["data"])) + + elif type == "Doctype": + if token["name"]: + if token["publicId"]: + output.append("""%s""" % + (" " * indent, + token["name"], + token["publicId"], + token["systemId"] if token["systemId"] else "")) + elif token["systemId"]: + output.append("""%s""" % + (" " * indent, + token["name"], + token["systemId"])) + else: + output.append("%s" % (" " * indent, + token["name"])) + else: + output.append("%s" % (" " * indent,)) + + elif type == "Characters": + output.append("%s\"%s\"" % (" " * indent, token["data"])) + + elif type == "SpaceCharacters": + assert False, "concatenateCharacterTokens should have got rid of all Space tokens" + + else: + raise ValueError("Unknown token type, %s" % type) + + return "\n".join(output) diff --git a/lib/html5lib/treewalkers/_base.py b/lib/html5lib/treewalkers/_base.py index 34252e50c..4e11cd020 100644 --- a/lib/html5lib/treewalkers/_base.py +++ b/lib/html5lib/treewalkers/_base.py @@ -1,8 +1,8 @@ from __future__ import absolute_import, division, unicode_literals from six import text_type, string_types -import gettext -_ = gettext.gettext +__all__ = ["DOCUMENT", "DOCTYPE", "TEXT", "ELEMENT", "COMMENT", "ENTITY", "UNKNOWN", + "TreeWalker", "NonRecursiveTreeWalker"] from xml.dom import Node @@ -58,7 +58,7 @@ def emptyTag(self, namespace, name, attrs, hasChildren=False): "namespace": to_text(namespace), "data": attrs} if hasChildren: - yield self.error(_("Void element has children")) + yield self.error("Void element has children") def startTag(self, namespace, name, attrs): assert namespace is None or isinstance(namespace, string_types), type(namespace) @@ -122,7 +122,7 @@ def entity(self, name): return {"type": "Entity", "name": text_type(name)} def unknown(self, nodeType): - return self.error(_("Unknown node type: ") + nodeType) + return self.error("Unknown node type: " + nodeType) class NonRecursiveTreeWalker(TreeWalker): diff 
--git a/lib/html5lib/treewalkers/dom.py b/lib/html5lib/treewalkers/dom.py index a01287a94..ac4dcf31b 100644 --- a/lib/html5lib/treewalkers/dom.py +++ b/lib/html5lib/treewalkers/dom.py @@ -2,9 +2,6 @@ from xml.dom import Node -import gettext -_ = gettext.gettext - from . import _base diff --git a/lib/html5lib/treewalkers/etree.py b/lib/html5lib/treewalkers/etree.py index fd8a9cc9b..69840c21e 100644 --- a/lib/html5lib/treewalkers/etree.py +++ b/lib/html5lib/treewalkers/etree.py @@ -7,12 +7,10 @@ from ordereddict import OrderedDict except ImportError: OrderedDict = dict -import gettext -_ = gettext.gettext import re -from six import text_type +from six import string_types from . import _base from ..utils import moduleFactoryFactory @@ -60,7 +58,7 @@ def getNodeDetails(self, node): return _base.COMMENT, node.text else: - assert type(node.tag) == text_type, type(node.tag) + assert isinstance(node.tag, string_types), type(node.tag) # This is assumed to be an ordinary element match = tag_regexp.match(node.tag) if match: diff --git a/lib/html5lib/treewalkers/lxmletree.py b/lib/html5lib/treewalkers/lxmletree.py index bc934ac05..90e116d38 100644 --- a/lib/html5lib/treewalkers/lxmletree.py +++ b/lib/html5lib/treewalkers/lxmletree.py @@ -4,9 +4,6 @@ from lxml import etree from ..treebuilders.etree import tag_regexp -from gettext import gettext -_ = gettext - from . import _base from .. 
import ihatexml @@ -130,7 +127,7 @@ def __init__(self, tree): def getNodeDetails(self, node): if isinstance(node, tuple): # Text node node, key = node - assert key in ("text", "tail"), _("Text nodes are text or tail, found %s") % key + assert key in ("text", "tail"), "Text nodes are text or tail, found %s" % key return _base.TEXT, ensure_str(getattr(node, key)) elif isinstance(node, Root): @@ -169,7 +166,7 @@ def getNodeDetails(self, node): attrs, len(node) > 0 or node.text) def getFirstChild(self, node): - assert not isinstance(node, tuple), _("Text nodes have no children") + assert not isinstance(node, tuple), "Text nodes have no children" assert len(node) or node.text, "Node has no children" if node.text: @@ -180,7 +177,7 @@ def getFirstChild(self, node): def getNextSibling(self, node): if isinstance(node, tuple): # Text node node, key = node - assert key in ("text", "tail"), _("Text nodes are text or tail, found %s") % key + assert key in ("text", "tail"), "Text nodes are text or tail, found %s" % key if key == "text": # XXX: we cannot use a "bool(node) and node[0] or None" construct here # because node[0] might evaluate to False if it has no child element @@ -196,7 +193,7 @@ def getNextSibling(self, node): def getParentNode(self, node): if isinstance(node, tuple): # Text node node, key = node - assert key in ("text", "tail"), _("Text nodes are text or tail, found %s") % key + assert key in ("text", "tail"), "Text nodes are text or tail, found %s" % key if key == "text": return node # else: fallback to "normal" processing diff --git a/lib/html5lib/utils.py b/lib/html5lib/utils.py index 2f41f4dfa..fdc18febb 100644 --- a/lib/html5lib/utils.py +++ b/lib/html5lib/utils.py @@ -2,6 +2,8 @@ from types import ModuleType +from six import text_type + try: import xml.etree.cElementTree as default_etree except ImportError: @@ -9,7 +11,26 @@ __all__ = ["default_etree", "MethodDispatcher", "isSurrogatePair", - "surrogatePairToCodepoint", "moduleFactoryFactory"] + 
"surrogatePairToCodepoint", "moduleFactoryFactory", + "supports_lone_surrogates"] + + +# Platforms not supporting lone surrogates (\uD800-\uDFFF) should be +# caught by the below test. In general this would be any platform +# using UTF-16 as its encoding of unicode strings, such as +# Jython. This is because UTF-16 itself is based on the use of such +# surrogates, and there is no mechanism to further escape such +# escapes. +try: + _x = eval('"\\uD800"') + if not isinstance(_x, text_type): + # We need this with u"" because of http://bugs.jython.org/issue2039 + _x = eval('u"\\uD800"') + assert isinstance(_x, text_type) +except: + supports_lone_surrogates = False +else: + supports_lone_surrogates = True class MethodDispatcher(dict):