Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Binary file removed .DS_Store
Binary file not shown.
40 changes: 39 additions & 1 deletion src/data/db/database.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ def insert_library(location, address, latitude, longitude):
conn.close()


def insert_printer(location, description, latitude, longitude):
def insert_printer(location, description, labels, latitude, longitude):
"""Insert a printer into the database."""
conn = get_db_connection()
cursor = conn.cursor()
Expand All @@ -44,6 +44,44 @@ def insert_printer(location, description, latitude, longitude):
""",
(location, description, latitude, longitude),
)

# Insert labels into the labels table and get their IDs.
# "INSERT OR IGNORE" collapses duplicates against the labels.label UNIQUE
# constraint; the follow-up SELECT resolves the id whether the row was just
# inserted or already existed.
label_ids = []
for label in labels:
cursor.execute(
"""
INSERT OR IGNORE INTO labels (label)
VALUES (?)
""",
(label,),
)
cursor.execute(
"""
SELECT id FROM labels WHERE label = ?
""",
(label,),
)
label_id = cursor.fetchone()[0]
label_ids.append(label_id)

# Create entries in the junction table for printer-label relationships
# NOTE(review): re-selecting the printer by all four columns returns an
# arbitrary matching row if duplicate printers exist; `cursor.lastrowid`
# from the printers INSERT above would identify the new row reliably — confirm.
cursor.execute(
"""
SELECT id FROM printers WHERE location = ? AND description = ? AND latitude = ? AND longitude = ?
""",
(location, description, latitude, longitude),
)
printer_id = cursor.fetchone()[0]

# Insert into junction table
# "OR IGNORE" tolerates re-inserting an existing (printer_id, label_id) pair.
for label_id in label_ids:
cursor.execute(
"""
INSERT OR IGNORE INTO printer_labels (printer_id, label_id)
VALUES (?, ?)
""",
(printer_id, label_id),
)

conn.commit()
conn.close()
24 changes: 24 additions & 0 deletions src/data/db/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ def create_tables():
conn = sqlite3.connect(DB_PATH)
cursor = conn.cursor()

#TODO: Remove UNIQUE constraint from location
cursor.execute(
"""
CREATE TABLE IF NOT EXISTS libraries (
Expand Down Expand Up @@ -50,6 +51,29 @@ def create_tables():
)
"""
)

# Table for storing unique labels
# (label is UNIQUE NOT NULL, so repeated inserts of the same text collapse
# to a single row)
cursor.execute(
"""
CREATE TABLE IF NOT EXISTS labels (
id INTEGER PRIMARY KEY AUTOINCREMENT,
label TEXT UNIQUE NOT NULL
)
"""
)

# Junction table for many-to-many relationship between printers and labels
# NOTE(review): ON DELETE CASCADE is only enforced when the connection runs
# `PRAGMA foreign_keys = ON` — SQLite ships with foreign-key enforcement
# disabled by default; confirm the connection helper enables it.
cursor.execute(
"""
CREATE TABLE IF NOT EXISTS printer_labels (
printer_id INTEGER NOT NULL,
label_id INTEGER NOT NULL,
PRIMARY KEY (printer_id, label_id),
FOREIGN KEY (printer_id) REFERENCES printers(id) ON DELETE CASCADE,
FOREIGN KEY (label_id) REFERENCES labels(id) ON DELETE CASCADE
)
"""
)

conn.commit()
conn.close()
Expand Down
276 changes: 254 additions & 22 deletions src/data/scrapers/printers.py
Original file line number Diff line number Diff line change
@@ -1,37 +1,269 @@
import requests
from bs4 import BeautifulSoup
from difflib import get_close_matches # For data scraping
from difflib import SequenceMatcher
import re # For using regex
import unicodedata # Handles text encoding at Unicode level

# URL of the CU Print directory page
URL = "https://www.cornell.edu/about/maps/directory/?layer=CUPrint&caption=%20CU%20Print%20Printers" # Replace with the actual URL
# URL = "https://www.cornell.edu/about/maps/directory/?layer=CUPrint&caption=%20CU%20Print%20Printers" # Replace with the actual URL

def scrape_printers():
# Send a GET request to fetch the HTML content
response = requests.get(URL)
soup = BeautifulSoup(response.text, 'html.parser')
# text-data.cfm endpoint: returns the directory table as JSON (see
# fetch_printers_json, which calls .json() on the response).
URL = 'https://www.cornell.edu/about/maps/directory/text-data.cfm?layer=CUPrint&caption=%20CU%20Print%20Printers'

# HTTP headers to mimic a real browser request
# ("X-Requested-With: XMLHttpRequest" marks the request as an AJAX call)
HEADERS = {
"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/141.0.0.0 Safari/537.36",
"Referer": 'https://www.cornell.edu/about/maps/directory/',
"X-Requested-With": 'XMLHttpRequest',
"Accept": 'application/json, text/javascript, */*',
}

# Canonical list of Cornell buildings
# NOTE: This list is not exhaustive. Add more buildings as needed...
CANONICAL_BUILDINGS = [
"Akwe:kon",
"Alice Cook House",
"Baker Lab",
"Barton Hall",
"Becker House",
"Breazzano Center",
"Catherwood Library",
"Clark Hall",
"College of Veterinary Medicine",
"Court-Kay-Bauer Hall",
"Dickson",
"Ecology House",
"Flora Rose House",
"Ganedago",
"Hans Bethe House",
"Hollister Hall",
"Ives Hall",
"John Henrik Clarke Africana Library",
"Keeton House",
"Kroch Library",
"Latino Living Center",
"Law Library",
"Lincoln Hall",
"Mann Library",
"Martha Van Rensselaer Hall",
"Mary Donlon Hall",
"Math Library",
"Mews Hall",
"Milstein Hall",
"Morrison Hall",
"Myron Taylor",
"Olin Library",
"Phillips Hall",
"Plant Science",
"RPCC",
"Rand Hall",
"Rhodes Hall",
"Risley Hall",
"Rockefeller Lab",
"Ruth Bader Ginsburg Hall",
"Sage Hall",
"Schwartz Center",
"Sibley Hall",
"Statler Hall",
"Stimson",
"Tjaden Hall",
"Toni Morrison",
"Ujamaa",
"Upson Hall",
"Uris Library",
"Vet Library",
"Warren Hall",
"White Hall",
"Willard Student Center"
]

# Regex helpers
HTML_TAG_RE = re.compile(r"<[^>]+>")
BRACKET_CONTENT_RE = re.compile(r"[\(\[\{].*?[\)\]\}]")
MULTI_SPACE_RE = re.compile(r"\s+")
TRAILING_CAPS_RE = re.compile(r"\b[A-Z]{2,}(?:\s+[A-Z]{2,})*\s*$")

# Used for stripping common label phrases from building names
LABEL_PHRASES_RE = re.compile(
r"""
\bresidents?\s*only\b |
\bstudents?\s*only\b |
\baa\s*&\s*p\b |
\baap\b
""", re.IGNORECASE | re.VERBOSE
)

# Used to identify common variants of labels
# NOTE(review): "Color" matches a strict subset of the text that
# "Color, Scan, & Copy" matches, and precedes it in insertion order. Any
# consumer that strips matches in dict order can prevent the longer label
# from ever matching — confirm the intended ordering/consumption.
LABEL_PATTERNS = {
# --- Access restrictions ---
# Residents Only (singular/plural + optional hyphen + any case)
"Residents Only": re.compile(r"\bresident[s]?[-\s]*only\b", re.IGNORECASE),

# AA&P Students Only (accept AA&P or AAP; allow any junk in-between; optional hyphen)
"AA&P Students Only": re.compile(
r"\b(?:aa\s*&\s*p|aap)\b.*\bstudent[s]?[-\s]*only\b",
re.IGNORECASE
),

# Landscape Architecture Students Only (allow arbitrary whitespace; optional hyphen)
"Landscape Architecture Students Only": re.compile(
r"\blandscape\s+architecture\b.*\bstudent[s]?[-\s]*only\b",
re.IGNORECASE
),

# --- Printer capabilities ---
"Color": re.compile(r"\bcolor\b", re.IGNORECASE),
"Black & White": re.compile(
r"\b(?:black\s*(?:and|&)\s*white|b\s*&\s*w)\b", re.IGNORECASE
),
"Color, Scan, & Copy": re.compile(
r"\bcolor[,/ &]*(scan|copy|print|copying)+\b", re.IGNORECASE
),
}

# Used for stripping residual trailing labels from descriptions
RESIDUAL_TRAILING_LABEL_RE = re.compile(
r"\b(?:resident|residents|student|students|staff|public)\b\s*$",
re.IGNORECASE
)

def _norm(s):
    """Normalize *s* at the Unicode/HTML/whitespace level.

    ``None`` becomes the empty string. Otherwise the text is NFKC-normalized,
    HTML tags, literal asterisks and bracketed content are blanked out, and
    runs of whitespace are collapsed to single spaces.
    """
    if s is None:
        return ""
    text = unicodedata.normalize('NFKC', s)
    # Replace markup/noise with spaces (not "") so word boundaries survive.
    text = HTML_TAG_RE.sub(" ", text)
    text = text.replace("*", " ")
    text = BRACKET_CONTENT_RE.sub(" ", text)
    return MULTI_SPACE_RE.sub(" ", text).strip()

def _strip_trailing_allcaps(s):
    """Drop a trailing run of ALL-CAPS words (e.g. ``RESIDENTS ONLY``)."""
    without_caps = TRAILING_CAPS_RE.sub("", s)
    return without_caps.strip()

def _pre_clean_for_match(s: str) -> str:
    """Prepare a building name for comparison against the canonical list.

    Normalizes the text, removes label phrases ("Resident(s) only", "AA&P",
    ...), trailing ALL-CAPS qualifiers and residual audience words, then
    strips punctuation noise and collapses whitespace.
    """
    cleaned = _norm(s)
    cleaned = LABEL_PHRASES_RE.sub(" ", cleaned)          # label phrases
    cleaned = _strip_trailing_allcaps(cleaned)            # e.g. "RESIDENTS ONLY"
    cleaned = RESIDUAL_TRAILING_LABEL_RE.sub(" ", cleaned)  # e.g. "Students"
    cleaned = re.sub(r"[^\w\s\-’']", " ", cleaned)        # punctuation noise
    return re.sub(r"\s+", " ", cleaned).strip()

def _token_sort(s):
"""
Tokenize a string, sort the tokens, and re-join them.
"""
tokens = s.lower().split()
tokens.sort()
return " ".join(tokens)

def map_building(name, threshold=87):
    """Map a building name to a canonical building name using fuzzy matching.

    Args:
        name: Raw building string scraped from the directory.
        threshold: Minimum similarity score (0-100) needed to accept the
            canonical match.

    Returns:
        A ``(building, score)`` tuple. ``building`` is the canonical name when
        the best match scores at least ``threshold``, otherwise the original
        ``name``. Falsy input yields ``(None, 0)``.
    """
    # Fix: the previous span contained leftover removed-diff lines that
    # referenced undefined names (`soup`, `table`, `rows`) from the old
    # HTML-scraping implementation; they are dropped here.
    if not name:
        return None, 0

    query = _token_sort(_pre_clean_for_match(name))
    canon_token_list = [_token_sort(_pre_clean_for_match(c)) for c in CANONICAL_BUILDINGS]

    # Top-1 closest canonical candidate for the cleaned, token-sorted query.
    best = get_close_matches(query, canon_token_list, n=1)

    # No candidate at all: fall back to the raw name with a zero score.
    if not best:
        return name, 0

    match = best[0]

    # Recover the original canonical spelling and a 0-100 similarity score
    # (kept for debugging / threshold gating).
    index = canon_token_list.index(match)
    canon_raw = CANONICAL_BUILDINGS[index]
    score = int(SequenceMatcher(None, query, match).ratio() * 100)

    # Below-threshold matches fall back to the original scraped name.
    return (canon_raw, score) if score >= threshold else (name, score)

def map_labels(text):
    """Extract canonical label tokens from a description string.

    Args:
        text: Raw description text (may be ``None`` or empty).

    Returns:
        A ``(cleaned, labels)`` tuple: the description with every recognized
        label phrase removed and whitespace collapsed, plus the sorted,
        de-duplicated list of canonical labels found. Falsy input is returned
        unchanged with an empty label list.
    """
    if not text:
        return text, []

    cleaned = _norm(text)

    # Fix: match every pattern against the same normalized text BEFORE
    # stripping anything. Previously each match was stripped immediately, so
    # "Color" consumed the word "color" and the longer "Color, Scan, & Copy"
    # pattern could never match.
    found_labels = [canon for canon, pattern in LABEL_PATTERNS.items()
                    if pattern.search(cleaned)]

    # Strip matched phrases in reverse insertion order so the longer,
    # more specific patterns (defined later) remove their full text before
    # shorter overlapping ones run.
    for canon in reversed(found_labels):
        cleaned = LABEL_PATTERNS[canon].sub("", cleaned).strip()

    cleaned = re.sub(r"\s+", " ", cleaned).strip()
    return cleaned, sorted(set(found_labels))

def fetch_printers_json():
    """Fetch the CU Print directory data and return the decoded JSON payload.

    Raises:
        requests.HTTPError: If the endpoint responds with an error status.
    """
    response = requests.get(URL, headers=HEADERS, timeout=20)
    response.raise_for_status()
    return response.json()

def scrape_printers():
    """
    Scrape CU Print printer locations from the Cornell directory endpoint.

    Returns:
        A list of dicts, one per printer, with keys "Location" (canonical
        building name), "Description" (label-free description text),
        "Coordinates" ([lat, lng] as floats) and "Labels" (sorted canonical
        label strings).
    """
    # Fix: the previous span interleaved leftover removed-diff lines from the
    # old BeautifulSoup implementation (`cols`, duplicate dict keys, a second
    # `return data`); only the JSON-based flow is kept.
    payload = fetch_printers_json()
    data = []

    # payload['rows'] is a list of rows, each shaped
    # ["Building", "Equipment & Location", "Coordinates (Lat, Lng)"].
    for row in payload['rows']:
        if len(row) < 3:
            continue  # Skip rows with insufficient columns.

        # Slice rather than exact-unpack so a row with trailing extra
        # columns doesn't raise ValueError.
        raw_building, raw_location, raw_coordinates = row[:3]

        # Map the raw building name to its canonical form.
        building, _ = map_building(raw_building)

        # Labels can appear in both the building name (e.g. "Residents Only")
        # and the location description.
        _, building_labels = map_labels(raw_building)
        remainder, location_labels = map_labels(raw_location)

        # Deduplicate and sort the combined labels.
        labels = sorted(set(building_labels + location_labels))

        # Remove leftover leading delimiters (" - ", " / ", ": ", ...) left
        # behind where labels were stripped out of the description.
        description = re.sub(r"^[\s\-–—:/|]+", "", remainder).strip()

        # Coordinates arrive as a "lat, lng" string — split into floats.
        coordinates = [float(x) for x in raw_coordinates.split(', ')]

        data.append({
            "Location": building,
            "Description": description,
            "Coordinates": coordinates,
            "Labels": labels,
        })

    return data
2 changes: 1 addition & 1 deletion src/data/scripts/populate_db.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ def populate_db():
# Insert printers
printers = scrape_printers()
for printer in printers:
insert_printer(printer['Location'], printer['Description'], printer['Coordinates'][0], printer['Coordinates'][1])
insert_printer(printer['Location'], printer['Description'], printers['Labels'], printer['Coordinates'][0], printer['Coordinates'][1])

if __name__ == "__main__":
populate_db()
Loading