Skip to content

Commit

Permalink
Benchmarking and optimizations
Browse files Browse the repository at this point in the history
  • Loading branch information
joepie91 committed Jun 28, 2014
1 parent 672f649 commit 9203d83
Show file tree
Hide file tree
Showing 3 changed files with 58 additions and 6 deletions.
7 changes: 6 additions & 1 deletion pwhois
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,12 @@ else:
if key in contact_data and contact_data[key] is not None:
label = " " + value + (" " * (widest_label - len(value))) + " :"
if sys.version_info < (3, 0):
actual_data = unicode(contact_data[key])
if type(contact_data[key]) == str:
actual_data = contact_data[key].decode("utf-8")
elif type(contact_data[key]) == datetime.datetime:
actual_data = unicode(contact_data[key])
else:
actual_data = contact_data[key]
else:
actual_data = str(contact_data[key])
if "\n" in actual_data: # Indent multi-line values properly
Expand Down
35 changes: 31 additions & 4 deletions pythonwhois/parse.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,9 @@ def read_dataset(filename, destination, abbrev_key, name_key, is_dict=False):
read_dataset("states_us.dat", states_us, "abbreviation", "name", is_dict=True)
read_dataset("states_ca.dat", states_ca, "abbreviation", "name", is_dict=True)

def precompile_regexes(source, flags=0):
	"""Compile each regex pattern in `source` once and return the compiled objects.

	Compiling up-front hoists the pattern-cache lookup out of the per-line
	matching loops, which is the point of this benchmarking change.

	:param source: iterable of regex pattern strings
	:param flags: optional `re` flags (e.g. re.IGNORECASE) applied to every pattern
	:return: list of compiled pattern objects, in the same order as `source`
	"""
	compiled = []
	for pattern in source:
		compiled.append(re.compile(pattern, flags))
	return compiled

grammar = {
"_data": {
'id': ['Domain ID:[ ]*(?P<val>.+)'],
Expand Down Expand Up @@ -389,6 +392,30 @@ def preprocess_regex(regex):
r"\ss\.?a\.?r\.?l\.?($|\s)",
)

# Precompile every grammar/contact regex once at import time so the per-line
# matching loops don't re-resolve patterns on each search/match call.
# The "_data" field patterns are compiled with IGNORECASE, matching the flag
# that the removed inline `re.search(..., re.IGNORECASE)` calls used.
grammar["_data"]["id"] = precompile_regexes(grammar["_data"]["id"], re.IGNORECASE)
grammar["_data"]["status"] = precompile_regexes(grammar["_data"]["status"], re.IGNORECASE)
grammar["_data"]["creation_date"] = precompile_regexes(grammar["_data"]["creation_date"], re.IGNORECASE)
grammar["_data"]["expiration_date"] = precompile_regexes(grammar["_data"]["expiration_date"], re.IGNORECASE)
grammar["_data"]["updated_date"] = precompile_regexes(grammar["_data"]["updated_date"], re.IGNORECASE)
grammar["_data"]["registrar"] = precompile_regexes(grammar["_data"]["registrar"], re.IGNORECASE)
grammar["_data"]["whois_server"] = precompile_regexes(grammar["_data"]["whois_server"], re.IGNORECASE)
grammar["_data"]["nameservers"] = precompile_regexes(grammar["_data"]["nameservers"], re.IGNORECASE)
grammar["_data"]["emails"] = precompile_regexes(grammar["_data"]["emails"], re.IGNORECASE)

# Date-format patterns were previously matched via `re.match(..., re.IGNORECASE)`,
# so the flag is preserved here.
grammar["_dateformats"] = precompile_regexes(grammar["_dateformats"], re.IGNORECASE)

# Contact-section regexes are compiled without flags; organization_regexes keeps
# IGNORECASE to match its former inline `re.search(..., re.IGNORECASE)` calls.
# NOTE(review): the search call sites for the contact regex lists are not
# visible here — confirm they previously searched case-sensitively, otherwise
# this precompilation silently changes matching behaviour.
registrant_regexes = precompile_regexes(registrant_regexes)
tech_contact_regexes = precompile_regexes(tech_contact_regexes)
billing_contact_regexes = precompile_regexes(billing_contact_regexes)
admin_contact_regexes = precompile_regexes(admin_contact_regexes)
nic_contact_regexes = precompile_regexes(nic_contact_regexes)
organization_regexes = precompile_regexes(organization_regexes, re.IGNORECASE)

# NIC handle-reference patterns, also compiled flag-less (case-sensitive).
nic_contact_references["registrant"] = precompile_regexes(nic_contact_references["registrant"])
nic_contact_references["tech"] = precompile_regexes(nic_contact_references["tech"])
nic_contact_references["admin"] = precompile_regexes(nic_contact_references["admin"])
nic_contact_references["billing"] = precompile_regexes(nic_contact_references["billing"])

if sys.version_info < (3, 0):
def is_string(data):
"""Test for string with support for python 2."""
Expand All @@ -409,7 +436,7 @@ def parse_raw_whois(raw_data, normalized=[], never_query_handles=True, handle_se
if (rule_key in data) == False:
for line in segment.splitlines():
for regex in rule_regexes:
result = re.search(regex, line, re.IGNORECASE)
result = re.search(regex, line)

if result is not None:
val = result.group("val").strip()
Expand Down Expand Up @@ -634,7 +661,7 @@ def normalize_data(data, normalized):
new_lines = []
for i, line in enumerate(lines):
for regex in organization_regexes:
if re.search(regex, line, re.IGNORECASE):
if re.search(regex, line):
new_lines.append(line)
del lines[i]
break
Expand All @@ -650,7 +677,7 @@ def normalize_data(data, normalized):
lines = [x.strip() for x in contact["street"].splitlines()]
if len(lines) > 1:
for regex in organization_regexes:
if re.search(regex, lines[0], re.IGNORECASE):
if re.search(regex, lines[0]):
contact["organization"] = lines[0]
contact["street"] = "\n".join(lines[1:])
break
Expand Down Expand Up @@ -714,7 +741,7 @@ def parse_dates(dates):

for date in dates:
for rule in grammar['_dateformats']:
result = re.match(rule, date, re.IGNORECASE)
result = re.match(rule, date)

if result is not None:
try:
Expand Down
22 changes: 21 additions & 1 deletion test.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#!/usr/bin/env python2

import sys, argparse, os, pythonwhois, json, datetime, codecs
import sys, argparse, os, pythonwhois, json, datetime, codecs, time
import pkgutil
import encodings

Expand Down Expand Up @@ -94,6 +94,8 @@ def recursive_compare(obj1, obj2, chain=[]):
targets.sort()

if args.mode[0] == "run":
times_default = []
times_normalized = []
errors = False
suites = []
for target in targets:
Expand Down Expand Up @@ -134,7 +136,9 @@ def recursive_compare(obj1, obj2, chain=[]):
total = len(suites) * 2
for target, data, target_default, target_normalized in suites:
for normalization in (True, []):
start_time = time.time()
parsed = pythonwhois.parse.parse_raw_whois(data, normalized=normalization)
time_taken = (time.time() - start_time) * 1000 # in ms
parsed = json.loads(encoded_json_dumps(parsed)) # Stupid Unicode hack

if normalization == True:
Expand All @@ -155,6 +159,10 @@ def recursive_compare(obj1, obj2, chain=[]):
sys.stdout.write(OK)
sys.stdout.write(progress_prefix + "%s passed in %s mode.\n" % (target, mode))
sys.stderr.write(ENDC)
if normalization == True:
times_normalized.append(time_taken)
else:
times_default.append(time_taken)
total_passed += 1
else:
sys.stderr.write(FAIL)
Expand All @@ -169,6 +177,18 @@ def recursive_compare(obj1, obj2, chain=[]):
total_failed += 1
done += 1

if len(times_default) > 0:
average_default = int(sum(times_default) / float(len(times_default)))
min_default = min(times_default)
max_default = max(times_default)
sys.stdout.write("Timing in default mode: %dms avg, %dms min, %dms max\n" % (average_default, min_default, max_default))

if len(times_normalized) > 0:
average_normalized = int(sum(times_normalized) / float(len(times_normalized)))
min_normalized = min(times_normalized)
max_normalized = max(times_normalized)
sys.stdout.write("Timing in normalized mode: %dms avg, %dms min, %dms max\n" % (average_normalized, min_normalized, max_normalized))

if total_failed == 0:
sys.stdout.write(OK)
sys.stdout.write("All tests passed!\n")
Expand Down

0 comments on commit 9203d83

Please sign in to comment.