Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions parser/parser.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ import (
"fmt"
"log"
"os"
"path/filepath"
"time"

"github.com/UTDNebula/api-tools/utils"
Expand Down Expand Up @@ -54,10 +55,10 @@ func Parse(inDir string, outDir string, csvPath string, skipValidation bool) {
}

// Try to load any existing profile data
loadProfiles(inDir)
loadProfiles(filepath.Join(inDir, "profiles"))

// Find paths of all scraped data
paths := utils.GetAllFilesWithExtension(inDir, ".html")
paths := utils.GetAllFilesWithExtension(filepath.Join(inDir, "coursebook"), ".html")
if !skipValidation {
log.Printf("Parsing and validating %d files...", len(paths))
} else {
Expand Down
59 changes: 46 additions & 13 deletions parser/parser_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -132,14 +132,13 @@ func updateTestData() error {
//doesn't do anything since there is no profile data
loadProfiles("")

tempDir, err := os.MkdirTemp("", "testdata-*")
tempResultDir, err := os.MkdirTemp("", "testdata-*")
if err != nil {
log.Fatal(err)
log.Fatalf("Failed to create temporary directory: %v", err)
}
defer os.RemoveAll(tempDir)
defer os.RemoveAll(tempResultDir)

//Fill temp dir with all the test cases and expected values

duplicates := make(map[string]bool)

for i, input := range utils.GetAllFilesWithExtension("testdata", ".html") {
Expand Down Expand Up @@ -188,7 +187,7 @@ func updateTestData() error {
}
classInfo := getClassInfo(doc)

caseDir := filepath.Join(tempDir, fmt.Sprintf("case_%03d", i))
caseDir := filepath.Join(tempResultDir, fmt.Sprintf("case_%03d", i))
if err = os.Mkdir(caseDir, 0777); err != nil {
return fmt.Errorf("failed to create directory: %v", err)
}
Expand Down Expand Up @@ -218,26 +217,53 @@ func updateTestData() error {
clearGlobals()
}

//rerun parser to get Courses.json, Sections.json, Professors.json

//Parse(tempDir, tempDir, "../grade-data", false)
// Grade data doesn't work with tests currently
Parse(tempDir, tempDir, "", false)
input, err := createSampleInput()
if err != nil {
return fmt.Errorf("failed to create sample input for Parse: %v", err)
}
defer os.RemoveAll(input)
Parse(input, tempResultDir, "", false)

//overwrite the current test data with the new data
if err := os.RemoveAll("testdata"); err != nil {
return fmt.Errorf("failed to remove testdata: %v", err)
}

if err := os.CopyFS("testdata", os.DirFS(tempDir)); err != nil {
if err := os.CopyFS("testdata", os.DirFS(tempResultDir)); err != nil {
return fmt.Errorf("failed to copy testdata: %v", err)
}

//reset maps to avoid side effects. maybe parser should be an object?
clearGlobals()
return nil
}

func createSampleInput() (string, error) {
tempInputDir, err := os.MkdirTemp("", "input-*")
if err != nil {
log.Fatalf("Failed to create temporary input directory: %v", err)
}

if err = os.Mkdir(filepath.Join(tempInputDir, "coursebook"), 0777); err != nil {
log.Fatalf("Failed to create course book directory in temp intput dir: %v", err)
}
// for future test data
if err = os.Mkdir(filepath.Join(tempInputDir, "profiles"), 0777); err != nil {
log.Fatalf("Failed to create profiles directory in temp intput dir: %v", err)
}

for i, input := range utils.GetAllFilesWithExtension("testdata", ".html") {
data, err := os.ReadFile(input)
if err != nil {
return "", fmt.Errorf("failed to load test data: %v", err)
}
err = os.WriteFile(filepath.Join(tempInputDir, "coursebook", fmt.Sprintf("input%03d.html", i)), data, 0777)
if err != nil {
return "", fmt.Errorf("failed to write test data: %v", err)
}
}
return tempInputDir, nil
}

func clearGlobals() {
Sections = make(map[primitive.ObjectID]*schema.Section)
Courses = make(map[string]*schema.Course)
Expand All @@ -249,8 +275,15 @@ func clearGlobals() {

func TestParse(t *testing.T) {
tempDir := t.TempDir()

input, err := createSampleInput()
if err != nil {
t.Errorf("failed to create sample input for Parse: %v", err)
}
defer os.RemoveAll(input)

// todo fix grade data, csvPath = ./grade-data panics
Parse("testdata", tempDir, "", false)
Parse(input, tempDir, "", false)

OutputCourses, err := unmarshallFile[[]schema.Course](filepath.Join(tempDir, "courses.json"))
if err != nil {
Expand Down
19 changes: 10 additions & 9 deletions scrapers/coursebook.go
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,8 @@ var (
)

const (
coursebookDir = "coursebook"

reqThrottle = 400 * time.Millisecond
prefixThrottle = 5 * time.Second
httpTimeout = 10 * time.Second
Expand Down Expand Up @@ -153,7 +155,7 @@ func (s *coursebookScraper) lastCompletePrefix() string {
log.Fatal(err)
}

dir, err := os.ReadDir(filepath.Join(s.outDir, s.term))
dir, err := os.ReadDir(filepath.Join(s.outDir, coursebookDir, s.term))
if err != nil {
log.Fatalf("failed to read output directory: %v", err)
}
Expand All @@ -179,26 +181,25 @@ func (s *coursebookScraper) lastCompletePrefix() string {
return ""
}

// ensurePrefixFolder creates {outDir}/term if it does not exist

// ensureOutputFolder creates {outDir}/coursebookDir/term if it does not exist
func (s *coursebookScraper) ensureOutputFolder() error {
if err := os.MkdirAll(filepath.Join(s.outDir, s.term), 0755); err != nil {
if err := os.MkdirAll(filepath.Join(s.outDir, coursebookDir, s.term), 0755); err != nil {
return fmt.Errorf("failed to create term forlder: %w", err)
}
return nil
}

// ensurePrefixFolder creates {outDir}/term/prefix if it does not exist
// ensurePrefixFolder creates {outDir}/coursebookDir/term/prefix if it does not exist
func (s *coursebookScraper) ensurePrefixFolder(prefix string) error {
if err := os.MkdirAll(filepath.Join(s.outDir, s.term, prefix), 0755); err != nil {
if err := os.MkdirAll(filepath.Join(s.outDir, coursebookDir, s.term, prefix), 0755); err != nil {
return fmt.Errorf("failed to create folder for %s: %w", prefix, err)
}
return nil
}

// writeSection writes content to file {outDir}/term/prefix/{id}.html
// writeSection writes content to file {outDir}/coursebookDir/term/prefix/{id}.html
func (s *coursebookScraper) writeSection(prefix string, id string, content string) error {
if err := os.WriteFile(filepath.Join(s.outDir, s.term, prefix, id+".html"), []byte(content), 0644); err != nil {
if err := os.WriteFile(filepath.Join(s.outDir, coursebookDir, s.term, prefix, id+".html"), []byte(content), 0644); err != nil {
return fmt.Errorf("failed to write section %s: %w", id, err)
}
return nil
Expand All @@ -219,7 +220,7 @@ func (s *coursebookScraper) getSectionContent(id string) (string, error) {
// getMissingIdsForPrefix calls getSectionIdsForPrefix and filters out the ids that already
// exist in the prefix directory
func (s *coursebookScraper) getMissingIdsForPrefix(prefix string) ([]string, error) {
path := filepath.Join(s.outDir, s.term, prefix)
path := filepath.Join(s.outDir, coursebookDir, s.term, prefix)

sectionIds, err := s.getSectionIdsForPrefix(prefix)
if err != nil {
Expand Down
57 changes: 46 additions & 11 deletions scrapers/profiles.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,13 @@ import (
"fmt"
"log"
"os"
"path/filepath"
"regexp"
"strconv"
"strings"

"github.com/chromedp/cdproto/dom"

"github.com/UTDNebula/api-tools/utils"
"github.com/UTDNebula/nebula-api/api/schema"
"github.com/chromedp/cdproto/cdp"
Expand All @@ -23,10 +26,11 @@ import (
"go.mongodb.org/mongo-driver/bson/primitive"
)

const BASE_URL string = "https://profiles.utdallas.edu/browse?page="
const BaseUrl string = "https://profiles.utdallas.edu/browse?page="
const ProfilesDir string = "profiles"

var primaryLocationRegex *regexp.Regexp = regexp.MustCompile(`^(\w+)\s+(\d+\.\d{3}[A-z]?)$`)
var fallbackLocationRegex *regexp.Regexp = regexp.MustCompile(`^([A-z]+)(\d+)\.?(\d{3}[A-z]?)$`)
var primaryLocationRegex = regexp.MustCompile(`^(\w+)\s+(\d+\.\d{3}[A-z]?)$`)
var fallbackLocationRegex = regexp.MustCompile(`^([A-z]+)(\d+)\.?(\d{3}[A-z]?)$`)

func parseLocation(text string) schema.Location {
var building string
Expand Down Expand Up @@ -99,7 +103,7 @@ func getNodeText(node *cdp.Node) string {
func scrapeProfessorLinks(chromedpCtx context.Context) []string {
var pageLinks []*cdp.Node
_, err := chromedp.RunResponse(chromedpCtx,
chromedp.Navigate(BASE_URL+"1"),
chromedp.Navigate(BaseUrl+"1"),
chromedp.QueryAfter(".page-link",
func(ctx context.Context, _ runtime.ExecutionContextID, nodes ...*cdp.Node) error {
pageLinks = nodes
Expand All @@ -119,7 +123,7 @@ func scrapeProfessorLinks(chromedpCtx context.Context) []string {
professorLinks := make([]string, 0, numPages)
for curPage := 1; curPage <= numPages; curPage++ {
_, err := chromedp.RunResponse(chromedpCtx,
chromedp.Navigate(BASE_URL+strconv.Itoa(curPage)),
chromedp.Navigate(BaseUrl+strconv.Itoa(curPage)),
chromedp.QueryAfter("//h5[@class='card-title profile-name']//a",
func(ctx context.Context, _ runtime.ExecutionContextID, nodes ...*cdp.Node) error {
for _, node := range nodes {
Expand All @@ -146,8 +150,8 @@ func ScrapeProfiles(outDir string) {
chromedpCtx, cancel := utils.InitChromeDp()
defer cancel()

err := os.MkdirAll(outDir, 0777)
if err != nil {
resultDir := filepath.Join(outDir, ProfilesDir)
if err := os.MkdirAll(resultDir, 0777); err != nil {
panic(err)
}

Expand All @@ -158,13 +162,24 @@ func ScrapeProfiles(outDir string) {
log.Print("Scraped professor links!")

for _, link := range professorLinks {
utils.VPrint("Scraping name...")

// Navigate to the link and get the names
var firstName, lastName string
html, err := getOuterHtml(chromedpCtx, link)
if err != nil {
log.Fatalf("Failed to scrape link %s: %v", link, err)
}

utils.VPrint("Scraping name...")
name := link[strings.LastIndex(link, "/"):]
if err = os.WriteFile(filepath.Join(resultDir, name+".html"), []byte(html), 0644); err != nil {
log.Fatalf("Failed save html for %s: %v", name, err)
return
}

_, err := chromedp.RunResponse(chromedpCtx,
/// Everything below should be moved to parser

// Navigate to the link and get the names
var firstName, lastName string
_, err = chromedp.RunResponse(chromedpCtx,
chromedp.Navigate(link),
chromedp.ActionFunc(func(ctx context.Context) error {
var text string
Expand Down Expand Up @@ -301,3 +316,23 @@ func ScrapeProfiles(outDir string) {
encoder.Encode(professors)
fptr.Close()
}

// getOuterHtml navigates the chromedp browser context to url and returns
// the outer HTML of the rendered document's root node.
func getOuterHtml(chromedpCtx context.Context, url string) (string, error) {
	var outer string

	capture := chromedp.ActionFunc(func(ctx context.Context) error {
		root, err := dom.GetDocument().Do(ctx)
		if err != nil {
			return err
		}
		outer, err = dom.GetOuterHTML().WithNodeID(root.NodeID).Do(ctx)
		return err
	})

	if err := chromedp.Run(chromedpCtx, chromedp.Navigate(url), capture); err != nil {
		return "", fmt.Errorf("failed to get outerHtml for page %s: %w", url, err)
	}
	return outer, nil
}