Skip to content

Commit

Permalink
Merge branch 'develop' into astra-scraper
Browse files Browse the repository at this point in the history
  • Loading branch information
jpahm authored Oct 14, 2024
2 parents e4d2501 + 28b3641 commit 3579c4b
Show file tree
Hide file tree
Showing 7 changed files with 86 additions and 33 deletions.
2 changes: 1 addition & 1 deletion grade-data/24S.csv
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
Subject,Catalog Nbr,Section,A+,A,A-,B+,B,B-,C+,C,C-,D+,D,D-,F,CR,I,NC,W,P,Instructor 1,Instructor 2,Instructor 3,Instructor 4,Instructor 5,Instructor 6
Subject,Catalog Nbr,Section,A+,A,A-,B+,B,B-,C+,C,C-,D+,D,D-,F,CR,I,NC,W,P,Instructor 1,Instructor 2,Instructor 3,Instructor 4,Instructor 5,Instructor 6
ACCT,2301,001,4,6,11,7,7,2,8,6,6,,4,,3,,,,3,,"Zhang, Jieying","Ozel, Naim Bugra","Gu, Dongdi","Zhang, Yang",,
ACCT,2301,002,6,9,16,7,11,1,7,4,3,,1,,,,,,2,,"Ozel, Naim Bugra","Zhang, Jieying","Gu, Dongdi","Zhang, Yang",,
ACCT,2301,003,8,11,18,6,6,,2,3,5,,3,,1,,,,4,,"Ozel, Naim Bugra","Zhang, Jieying","Gu, Dongdi","Zhang, Yang",,
Expand Down
16 changes: 8 additions & 8 deletions parser/gradeLoader.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ import (
"log"
"os"
"path/filepath"
"strconv"
"strings"
)

Expand Down Expand Up @@ -126,22 +127,21 @@ func csvToMap(csvFile *os.File, logFile *os.File) map[string][]int {

for _, record := range records {
// convert grade distribution from string to int
intSlice := make([]int, 0, 13)
var tempInt int
intSlice := [14]int{}

for j := 0; j < 13; j++ {
fmt.Sscan(record[aPlusCol+j], &tempInt)
intSlice = append(intSlice, tempInt)
intSlice[j], _ = strconv.Atoi(record[aPlusCol+j])
}
// add w number to the grade_distribution slice
if wCol != -1 {
fmt.Sscan(record[wCol], &tempInt)
intSlice[13], _ = strconv.Atoi(record[wCol])
}
intSlice = append(intSlice, tempInt)

// add new grade distribution to map, keyed by SUBJECT + NUMBER + SECTION
distroKey := record[subjectCol] + record[catalogNumberCol] + record[sectionCol]
distroMap[distroKey] = intSlice
// Be sure to trim left padding on section number
trimmedSectionNumber := strings.TrimLeft(record[sectionCol], "0")
distroKey := record[subjectCol] + record[catalogNumberCol] + trimmedSectionNumber
distroMap[distroKey] = intSlice[:]
}
return distroMap
}
2 changes: 1 addition & 1 deletion parser/parser.go
Original file line number Diff line number Diff line change
Expand Up @@ -153,7 +153,7 @@ func parse(path string) {
courseNum := utils.TrimWhitespace(classAndCourseNum[1])

// Figure out the academic session associated with this specific course/Section
session := getAcademicSession(rowInfo, classInfo)
session := getAcademicSession(rowInfo)

// Try to create the course and section based on collected info
courseRef := parseCourse(courseNum, session, rowInfo, classInfo)
Expand Down
8 changes: 6 additions & 2 deletions parser/sectionParser.go
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,11 @@ func parseSection(courseRef *schema.Course, classNum string, syllabusURI string,

semesterGrades, exists := GradeMap[session.Name]
if exists {
sectionGrades, exists := semesterGrades[courseRef.Subject_prefix+courseRef.Course_number+section.Section_number]
// We have to trim leading zeroes from the section number in order to match properly, since the grade data does not use leading zeroes
trimmedSectionNumber := strings.TrimLeft(section.Section_number, "0")
// Key into grademap should be uppercased like the grade data
gradeKey := strings.ToUpper(courseRef.Subject_prefix + courseRef.Course_number + trimmedSectionNumber)
sectionGrades, exists := semesterGrades[gradeKey]
if exists {
section.Grade_distribution = sectionGrades
}
Expand All @@ -78,7 +82,7 @@ func parseSection(courseRef *schema.Course, classNum string, syllabusURI string,
var termRegexp *regexp.Regexp = utils.Regexpf(`(?i)Term: (%s)`, utils.R_TERM_CODE)
var datesRegexp *regexp.Regexp = utils.Regexpf(`(?:Start|End)s: (%s)`, utils.R_DATE_MDY)

func getAcademicSession(rowInfo map[string]string, classInfo map[string]string) schema.AcademicSession {
func getAcademicSession(rowInfo map[string]string) schema.AcademicSession {
session := schema.AcademicSession{}
scheduleText := rowInfo["Schedule:"]

Expand Down
66 changes: 48 additions & 18 deletions scrapers/coursebook.go
Original file line number Diff line number Diff line change
Expand Up @@ -84,18 +84,33 @@ func ScrapeCoursebook(term string, startPrefix string, outDir string) {
// Get courses for term and prefix, split by grad and undergrad to avoid 300 section cap
for _, clevel := range []string{"clevel_u", "clevel_g"} {
queryStr := fmt.Sprintf("action=search&s%%5B%%5D=term_%s&s%%5B%%5D=%s&s%%5B%%5D=%s", term, coursePrefix, clevel)
req, err := http.NewRequest("POST", "https://coursebook.utdallas.edu/clips/clip-cb11-hat.zog", strings.NewReader(queryStr))
if err != nil {
panic(err)
}
req.Header = coursebookHeaders
res, err := cli.Do(req)

// Try HTTP request, retrying if necessary
res, err := utils.RetryHTTP(func() *http.Request {
req, err := http.NewRequest("POST", "https://coursebook.utdallas.edu/clips/clip-cb11-hat.zog", strings.NewReader(queryStr))
if err != nil {
panic(err)
}
req.Header = coursebookHeaders
return req
}, cli, func(res *http.Response, numRetries int) {
log.Printf("ERROR: Section find for course prefix %s failed! Response code was: %s", coursePrefix, res.Status)
// Wait longer if 3 retries fail; we've probably been IP ratelimited...
if numRetries >= 3 {
log.Printf("WARNING: More than 3 retries have failed. Waiting for 5 minutes before attempting further retries.")
time.Sleep(5 * time.Minute)
} else {
log.Printf("Getting new token and retrying in 3 seconds...")
time.Sleep(3 * time.Second)
}
coursebookHeaders = utils.RefreshToken(chromedpCtx)
// Give coursebook some time to recognize the new token
time.Sleep(500 * time.Millisecond)
})
if err != nil {
panic(err)
}
if res.StatusCode != 200 {
log.Panicf("ERROR: Section find failed! Status was: %s\nIf the status is 404, you've likely been IP ratelimited!", res.Status)
}

buf := bytes.Buffer{}
buf.ReadFrom(res.Body)
courseBuilder.Write(buf.Bytes())
Expand All @@ -116,18 +131,33 @@ func ScrapeCoursebook(term string, startPrefix string, outDir string) {
// Get section info
// Worth noting that the "req" and "div" params in the request below don't actually seem to matter... consider them filler to make sure the request goes through
queryStr := fmt.Sprintf("id=%s&req=0bd73666091d3d1da057c5eeb6ef20a7df3CTp0iTMYFuu9paDeUptMzLYUiW4BIk9i8LIFcBahX2E2b18WWXkUUJ1Y7Xq6j3WZAKPbREfGX7lZY96lI7btfpVS95YAprdJHX9dc5wM=&action=section&div=r-62childcontent", id)
req, err := http.NewRequest("POST", "https://coursebook.utdallas.edu/clips/clip-cb11-hat.zog", strings.NewReader(queryStr))
if err != nil {
panic(err)
}
req.Header = coursebookHeaders
res, err := cli.Do(req)

// Try HTTP request, retrying if necessary
res, err := utils.RetryHTTP(func() *http.Request {
req, err := http.NewRequest("POST", "https://coursebook.utdallas.edu/clips/clip-cb11-hat.zog", strings.NewReader(queryStr))
if err != nil {
panic(err)
}
req.Header = coursebookHeaders
return req
}, cli, func(res *http.Response, numRetries int) {
log.Printf("ERROR: Section id lookup for id %s failed! Response code was: %s", id, res.Status)
// Wait longer if 3 retries fail; we've probably been IP ratelimited...
if numRetries >= 3 {
log.Printf("WARNING: More than 3 retries have failed. Waiting for 5 minutes before attempting further retries.")
time.Sleep(5 * time.Minute)
} else {
log.Printf("Getting new token and retrying in 3 seconds...")
time.Sleep(3 * time.Second)
}
coursebookHeaders = utils.RefreshToken(chromedpCtx)
// Give coursebook some time to recognize the new token
time.Sleep(500 * time.Millisecond)
})
if err != nil {
panic(err)
}
if res.StatusCode != 200 {
log.Panicf("ERROR: Section id lookup for id %s failed! Status was: %s\nIf the status is 404, you've likely been IP ratelimited!", id, res.Status)
}

fptr, err := os.Create(fmt.Sprintf("%s/%s.html", courseDir, id))
if err != nil {
panic(err)
Expand Down
19 changes: 19 additions & 0 deletions utils/methods.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ import (
"fmt"
"io/fs"
"log"
"net/http"
"os"
"path/filepath"
"regexp"
Expand Down Expand Up @@ -232,3 +233,21 @@ func GetMapKeys[M ~map[K]V, K comparable, V any](m M) []K {
// Regexpf builds a regular expression from a printf-style template: the
// format string and vars are expanded with fmt.Sprintf and the resulting
// pattern is compiled with regexp.MustCompile, which panics if the pattern
// is not valid.
func Regexpf(format string, vars ...interface{}) *regexp.Regexp {
	pattern := fmt.Sprintf(format, vars...)
	compiled := regexp.MustCompile(pattern)
	return compiled
}

// RetryHTTP performs an HTTP request with the given client, retrying for as
// long as the server answers with a status code other than 200.
//
// requestCreator is invoked once per attempt so that every attempt gets a
// fresh *http.Request (a request body reader cannot be replayed once it has
// been consumed).
//
// retryCallback is invoked after every non-200 response with that response
// and the number of retries performed so far (0 on the first failure). It is
// the place to log, sleep, or refresh credentials before the next attempt.
// The response body is still open while the callback runs and is closed
// right after it returns, so the callback must not retain it.
//
// A transport-level error from client.Do (no response at all) is returned
// immediately as (nil, err) without invoking retryCallback, because there is
// no response to hand to it. On success the 200 response is returned with its
// body open; the caller is responsible for closing it.
func RetryHTTP(requestCreator func() *http.Request, client *http.Client, retryCallback func(res *http.Response, numRetries int)) (res *http.Response, err error) {
	for numRetries := 0; ; numRetries++ {
		res, err = client.Do(requestCreator())
		if err != nil {
			// res is nil here; inspecting res.StatusCode would panic.
			return nil, err
		}
		if res.StatusCode == http.StatusOK {
			return res, nil
		}
		retryCallback(res, numRetries)
		// Release the failed response before retrying so connections
		// and file descriptors are not leaked across attempts.
		res.Body.Close()
	}
}
6 changes: 3 additions & 3 deletions utils/regexes.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,13 +9,13 @@ const R_SUBJECT string = `[A-Z]{2,4}`

// Course code, e.g. 2252.
// The first digit of a course code is the course level, the second digit is the # of credit hours.
const R_COURSE_CODE string = `[0-9v]{4}`
const R_COURSE_CODE string = `[0-9vV]{4}`

// Subject + Course, captured
const R_SUBJ_COURSE_CAP string = `([A-Z]{2,4})\s*([0-9V]{4})`
const R_SUBJ_COURSE_CAP string = `([A-Z]{2,4})\s*([0-9vV]{4})`

// Subject + Course, uncaptured
const R_SUBJ_COURSE string = `[A-Z]{2,4}\s*[0-9V]{4}`
const R_SUBJ_COURSE string = `[A-Z]{2,4}\s*[0-9vV]{4}`

// Section code, e.g. 101
// Matches one or more ASCII digits or letters. The letter ranges are spelled
// out as A-Z and a-z because a single A-z range would also match the
// punctuation characters that sit between 'Z' and 'a' in ASCII.
const R_SECTION_CODE string = `[0-9A-Za-z]+`
Expand Down

0 comments on commit 3579c4b

Please sign in to comment.