diff --git a/README.md b/README.md index fe5b1d5..6d133ce 100644 --- a/README.md +++ b/README.md @@ -75,6 +75,7 @@ Run the tool by changing directory using `cd` to the `api-tools` directory and r | Command | Description | |---------|-------------| | `./api-tools -parse -astra` | Parses Astra data. | +| `./api-tools -parse -calendar` | Parses calendar data. | | `./api-tools -parse -csv [directory]` | Outputs grade data CSVs (default: `./grade-data`). | | `./api-tools -parse -map` | Parses UTD Map data. | | `./api-tools -parse -mazevo` | Parses Mazevo data. | diff --git a/go.mod b/go.mod index 0e98181..c340771 100644 --- a/go.mod +++ b/go.mod @@ -12,6 +12,7 @@ require ( github.com/valyala/fastjson v1.6.4 go.mongodb.org/mongo-driver v1.17.3 golang.org/x/net v0.36.0 + gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c ) require ( @@ -57,6 +58,8 @@ require ( github.com/json-iterator/go v1.1.12 // indirect github.com/klauspost/compress v1.17.8 // indirect github.com/klauspost/cpuid/v2 v2.2.9 // indirect + github.com/kr/pretty v0.3.1 // indirect + github.com/kr/text v0.2.0 // indirect github.com/leodido/go-urn v1.4.0 // indirect github.com/mailru/easyjson v0.9.0 // indirect github.com/mattn/go-isatty v0.0.20 // indirect @@ -65,6 +68,7 @@ require ( github.com/montanaflynn/stats v0.7.1 // indirect github.com/pelletier/go-toml/v2 v2.2.3 // indirect github.com/planetscale/vtprotobuf v0.6.1-0.20240319094008-0393e58bdf10 // indirect + github.com/rogpeppe/go-internal v1.13.1 // indirect github.com/twitchyliquid64/golang-asm v0.15.1 // indirect github.com/ugorji/go/codec v1.2.12 // indirect github.com/xdg-go/pbkdf2 v1.0.0 // indirect diff --git a/go.sum b/go.sum index 5849533..00de988 100644 --- a/go.sum +++ b/go.sum @@ -52,6 +52,7 @@ github.com/cloudwego/base64x v0.1.5/go.mod h1:0zlkT4Wn5C6NdauXdJRhSKRlJvmclQ1hhJ github.com/cloudwego/iasm v0.2.0/go.mod h1:8rXZaNYT2n95jn+zTI1sDr+IgcD2GVs0nlbbQPiEFhY= github.com/cncf/xds/go v0.0.0-20250121191232-2f005788dc42 
h1:Om6kYQYDUk5wWbT0t0q6pvyM49i9XZAv9dDrkDA7gjk= github.com/cncf/xds/go v0.0.0-20250121191232-2f005788dc42/go.mod h1:W+zGtBO5Y1IgJhy4+A9GOqVhqLpfZi+vwmdNXUehLA8= +github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E= github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= @@ -121,8 +122,11 @@ github.com/klauspost/cpuid/v2 v2.0.9/go.mod h1:FInQzS24/EEf25PyTYn52gqo7WaD8xa02 github.com/klauspost/cpuid/v2 v2.2.9 h1:66ze0taIn2H33fBvCkXuv9BmCwDfafmiIVpKV9kKGuY= github.com/klauspost/cpuid/v2 v2.2.9/go.mod h1:rqkxqrZ1EhYM9G+hXH7YdowN5R5RGN6NK4QwQ3WMXF8= github.com/knz/go-libedit v1.10.1/go.mod h1:MZTVkCWyz0oBc7JOWP3wNAzd002ZbM/5hgShxwh4x8M= +github.com/kr/pretty v0.2.1/go.mod h1:ipq/a2n7PKx3OHsz4KJII5eveXtPO4qwEXGdVfWzfnI= github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE= github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk= +github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ= +github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI= github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE= github.com/ledongthuc/pdf v0.0.0-20220302134840-0c2507a12d80 h1:6Yzfa6GP0rIo/kULo2bwGEkFvCePZ3qHDDTC3/J9Swo= @@ -144,10 +148,12 @@ github.com/orisano/pixelmatch v0.0.0-20220722002657-fb0b55479cde h1:x0TT0RDC7UhA github.com/orisano/pixelmatch v0.0.0-20220722002657-fb0b55479cde/go.mod h1:nZgzbfBr3hhjoZnS66nKrHmduYNpc34ny7RK4z5/HM0= github.com/pelletier/go-toml/v2 v2.2.3 h1:YmeHyLY8mFWbdkNWwpr+qIL2bEqT0o95WSdkNHvL12M= github.com/pelletier/go-toml/v2 v2.2.3/go.mod h1:MfCQTFTvCcUyyvvwm1+G6H/jORL20Xlb6rzQu9GuUkc= +github.com/pkg/diff 
v0.0.0-20210226163009-20ebb0f2a09e/go.mod h1:pJLUxLENpZxwdsKMEsNbx1VGcRFpLqf3715MtcvvzbA= github.com/planetscale/vtprotobuf v0.6.1-0.20240319094008-0393e58bdf10 h1:GFCKgmp0tecUJ0sJuv4pzYCqS9+RGSn52M3FUwPs+uo= github.com/planetscale/vtprotobuf v0.6.1-0.20240319094008-0393e58bdf10/go.mod h1:t/avpk3KcrXxUnYOhZhMXJlSEyie6gQbtLq5NM3loB8= github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/rogpeppe/go-internal v1.9.0/go.mod h1:WtVeX8xhTBvf0smdhujwtBcq4Qrzq/fJaraNFVN+nFs= github.com/rogpeppe/go-internal v1.13.1 h1:KvO1DLK/DRN07sQ1LQKScxyZJuNnedQ5/wKSR38lUII= github.com/rogpeppe/go-internal v1.13.1/go.mod h1:uMEvuHeurkdAXX61udpOXGD/AzZDWNMNyH2VO9fmH0o= github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= diff --git a/main.go b/main.go index 6680c8b..627eadf 100644 --- a/main.go +++ b/main.go @@ -38,8 +38,8 @@ func main() { scrapeProfiles := flag.Bool("profiles", false, "Alongside -scrape, signifies that professor profiles should be scraped.") // Flag for soc scraping scrapeOrganizations := flag.Bool("organizations", false, "Alongside -scrape, signifies that SOC organizations should be scraped.") - // Flag for calendar scraping - scrapeCalendar := flag.Bool("calendar", false, "Alongside -scrape, signifies that calendar should be scraped.") + // Flag for calendar scraping and parsing + calendar := flag.Bool("calendar", false, "Alongside -scrape or -parse, signifies that calendar should be scraped.") // Flag for astra scraping and parsing astra := flag.Bool("astra", false, "Alongside -scrape or -parse, signifies that Astra should be scraped/parsed.") // Flag for mazevo scraping and parsing @@ -106,7 +106,7 @@ func main() { scrapers.ScrapeCoursebook(*term, *startPrefix, *outDir, *resume) case *scrapeOrganizations: scrapers.ScrapeOrganizations(*outDir) - case *scrapeCalendar: + case *calendar: 
scrapers.ScrapeCalendar(*outDir)
 		case *astra:
 			scrapers.ScrapeAstra(*outDir)
@@ -119,6 +119,8 @@ func main() {
 		}
 	case *parse:
 		switch {
+		case *calendar:
+			parser.ParseCalendar(*inDir, *outDir)
 		case *astra:
 			parser.ParseAstra(*inDir, *outDir)
 		case *mazevo:
diff --git a/parser/calendarParser.go b/parser/calendarParser.go
new file mode 100644
index 0000000..8f1beb9
--- /dev/null
+++ b/parser/calendarParser.go
@@ -0,0 +1,299 @@
+package parser
+
+import (
+	"encoding/json"
+	"fmt"
+	"log"
+	"os"
+	"regexp"
+	"slices"
+	"strings"
+
+	"github.com/UTDNebula/api-tools/utils"
+	"github.com/UTDNebula/nebula-api/api/schema"
+)
+
+// buildingAbbreviations maps full building names to their abbreviations.
+// Some events list only the building name, not the abbreviation.
+var buildingAbbreviations = map[string]string{
+	"Activity Center":                              "AB",
+	"Activity Center Bookstore":                    "ACB",
+	"Administration":                               "AD",
+	"Edith and Peter O’Donnell Jr. Athenaeum":      "APC",
+	"Edith O'Donnell Arts and Technology Building": "ATC",
+	"Lloyd V. Berkner Hall":                        "BE",
+	"Bioengineering and Sciences Building":         "BSB",
+	"Classroom Building":                           "CB",
+	"Callier Center Richardson":                    "CR",
+	"Callier Center Addition":                      "CRA",
+	"Davidson-Gundy Alumni Center":                 "DGA",
+	"Dining Hall West":                             "DHW",
+	"Engineering and Computer Science North":       "ECSN",
+	"Engineering and Computer Science South":       "ECSS",
+	"Engineering and Computer Science West":        "ECSW",
+	"Energy Plant":                                 "EP",
+	"Founders Annex":                               "FA",
+	"Facilities Management":                        "FM",
+	"Founders North":                               "FN",
+	"Founders Building":                            "FO",
+	"Cecil H. Green Hall":                          "GR",
+	"Karl Hoblitzelle Hall":                        "HH",
+	"Erik Jonsson Academic Center":                 "JO",
+	"Naveen Jindal School of Management":           "JSOM",
+	"Eugene McDermott Library":                     "MC",
+	"Modular Lab 1":                                "ML1",
+	"Modular Lab 2":                                "ML2",
+	"North Office Building":                        "NB",
+	"North Lab":                                    "NL",
+	"Police":                                       "PD",
+	"Physics Annex":                                "PHA",
+	"Physics Building":                             "PHY",
+	"Natural Science and Engineering Research Lab": "RL",
+	"Research and Operations Center":               "ROC",
+	"Research and Operations Center West":          "ROW",
+	"Service Building":                             "SB",
+	"Sciences Building":                            "SCI",
+	"Safety and Grounds":                           "SG",
+	"Student Learning Center":                      "SLC",
+	"Student Services Building Addition":           "SSA",
+	"Student Services Building":                    "SSB",
+	"Student Union":                                "SU",
+	"Student Union Food Court":                     "SUFC",
+	"Synergy Park North":                           "SPN",
+	"Synergy Park North 2":                         "SP2",
+	"University Theatre":                           "TH",
+	"Visitor Center":                               "VC",
+	"Waterview Science and Technology Center":      "WSTC",
+	"Andromeda Hall & University Housing Office":   "RHA",
+	"Capella Hall":                                 "RHC",
+	"Helix Hall":                                   "RHH",
+	"Sirius Hall":                                  "RHS",
+	"Vega Hall":                                    "RHV",
+	"Recreation Center West":                       "RCW",
+	"SP/N Gallery":                                 "SP2",
+}
+
+// validAbbreviations lists every building abbreviation the parser accepts.
+var validAbbreviations = []string{
+	"AB",
+	"ACB",
+	"AD",
+	"APC",
+	"ATC",
+	"BE",
+	"BSB",
+	"CB",
+	"CR",
+	"CRA",
+	"DGA",
+	"DHW",
+	"ECSN",
+	"ECSS",
+	"ECSW",
+	"EP",
+	"FA",
+	"FM",
+	"FN",
+	"FO",
+	"GR",
+	"HH",
+	"JO",
+	"JSOM",
+	"MC",
+	"ML1",
+	"ML2",
+	"NB",
+	"NL",
+	"PD",
+	"PHA",
+	"PHY",
+	"RL",
+	"ROC",
+	"ROW",
+	"SB",
+	"SCI",
+	"SG",
+	"SLC",
+	"SSA",
+	"SSB",
+	"SU",
+	"SUFC",
+	"SPN",
+	"SP2",
+	"TH",
+	"VC",
+	"WSTC",
+	"RHA",
+	"RHC",
+	"RHH",
+	"RHS",
+	"RHV",
+	"RCW",
+}
+
+// otherCalendarLocation is the bucket for events whose building or room is unknown.
+const otherCalendarLocation = "Other"
+
+var (
+	// calendarBuildingRegexp matches candidate abbreviations at the start of a
+	// word; the optional trailing digit lets ML1, ML2 and SP2 match.
+	calendarBuildingRegexp = regexp.MustCompile(`\b[A-Z]{2,4}[0-9]?`)
+
+	// calendarRoomRegexp matches room numbers such as 2.412 or 12.204A.
+	calendarRoomRegexp = regexp.MustCompile(`([0-9]{1,2}\.[0-9]{3})([A-Z])?`)
+)
+
+// ParseCalendar reads the events scraped into inDir/eventScraped.json, groups
+// them by date, building, and room, and writes the result to
+// outDir/events.json. It panics if the input cannot be read or decoded, or if
+// the output cannot be written.
+func ParseCalendar(inDir string, outDir string) {
+	calendarFile, err := os.ReadFile(fmt.Sprintf("%s/eventScraped.json", inDir))
+	if err != nil {
+		panic(err)
+	}
+
+	var allEvents []schema.Event
+	if err := json.Unmarshal(calendarFile, &allEvents); err != nil {
+		panic(err)
+	}
+
+	// Nested index: date -> building -> room -> events.
+	multiBuildingMap := make(map[string]map[string]map[string][]schema.Event)
+
+	for _, event := range allEvents {
+		// The first 10 characters of the start time's string form are the date key
+		// (presumably YYYY-MM-DD; confirm against schema.Event).
+		date := event.StartTime.String()[:10]
+
+		// The converted location is dereferenced below, so guard against a nil
+		// pointer; such events are filed under "Other" instead of panicking.
+		var location string
+		if converted := utils.ConvertFromInterface[string](event.Location); converted != nil {
+			location = *converted
+		}
+
+		building, room := parseCalendarLocation(location)
+
+		if _, exists := multiBuildingMap[date]; !exists {
+			multiBuildingMap[date] = make(map[string]map[string][]schema.Event)
+		}
+		if _, exists := multiBuildingMap[date][building]; !exists {
+			multiBuildingMap[date][building] = make(map[string][]schema.Event)
+		}
+		multiBuildingMap[date][building][room] = append(multiBuildingMap[date][building][room], event)
+	}
+
+	// Keys are visited in sorted order so the output is stable between runs, and
+	// result starts non-nil so an empty input is written as [] rather than null.
+	result := make([]schema.MultiBuildingEvents[schema.Event], 0, len(multiBuildingMap))
+	for _, date := range sortedCalendarKeys(multiBuildingMap) {
+		buildings := multiBuildingMap[date]
+		singleBuildings := make([]schema.SingleBuildingEvents[schema.Event], 0, len(buildings))
+		for _, building := range sortedCalendarKeys(buildings) {
+			rooms := buildings[building]
+			roomEvents := make([]schema.RoomEvents[schema.Event], 0, len(rooms))
+			for _, room := range sortedCalendarKeys(rooms) {
+				roomEvents = append(roomEvents, schema.RoomEvents[schema.Event]{
+					Room:   room,
+					Events: rooms[room],
+				})
+			}
+
+			singleBuildings = append(singleBuildings, schema.SingleBuildingEvents[schema.Event]{
+				Building: building,
+				Rooms:    roomEvents,
+			})
+		}
+
+		result = append(result, schema.MultiBuildingEvents[schema.Event]{
+			Date:      date,
+			Buildings: singleBuildings,
+		})
+	}
+
+	if err := utils.WriteJSON(fmt.Sprintf("%s/events.json", outDir), result); err != nil {
+		panic(err)
+	}
+	log.Print("Parsed Calendar!")
+}
+
+// parseCalendarLocation extracts the building abbreviation and room from an
+// event location. Either value is "Other" when it cannot be determined, e.g.
+// for an empty or off-campus location.
+func parseCalendarLocation(location string) (string, string) {
+	// Prefer an explicit abbreviation; fall back to the full building name.
+	building, isValidBuilding := calendarAbbreviationIn(location)
+	if !isValidBuilding {
+		building, isValidBuilding = calendarBuildingByName(location)
+	}
+	if !isValidBuilding {
+		// Unrecognized tokens such as "UTD" must not become building keys.
+		building = otherCalendarLocation
+	}
+
+	room := calendarRoomRegexp.FindString(location)
+	// Without a room number, treat the text after the first comma as the room.
+	if room == "" && isValidBuilding {
+		if _, after, found := strings.Cut(location, ","); found {
+			room = strings.TrimSpace(after)
+		}
+	}
+	if room == "" {
+		room = otherCalendarLocation
+	}
+
+	return building, room
+}
+
+// calendarAbbreviationIn returns the first valid building abbreviation in location.
+func calendarAbbreviationIn(location string) (string, bool) {
+	for _, candidate := range calendarBuildingRegexp.FindAllString(location, -1) {
+		if slices.Contains(validAbbreviations, candidate) {
+			return candidate, true
+		}
+
+		// The optional digit may be the start of a room number (e.g. "ECSS2.412").
+		letters := strings.TrimRight(candidate, "0123456789")
+		if letters != candidate && slices.Contains(validAbbreviations, letters) {
+			return letters, true
+		}
+	}
+
+	return "", false
+}
+
+// calendarBuildingByName returns the abbreviation of the longest building name
+// that occurs in location, ignoring case.
+func calendarBuildingByName(location string) (string, bool) {
+	lowercaseLocation := strings.ToLower(location)
+
+	// Several names are prefixes of others (e.g. "Student Union" and "Student
+	// Union Food Court"), so the longest match wins; ties break alphabetically
+	// to stay independent of map iteration order.
+	longest := ""
+	for name := range buildingAbbreviations {
+		if !strings.Contains(lowercaseLocation, strings.ToLower(name)) {
+			continue
+		}
+		if len(name) > len(longest) || (len(name) == len(longest) && name < longest) {
+			longest = name
+		}
+	}
+	if longest == "" {
+		return "", false
+	}
+
+	return buildingAbbreviations[longest], true
+}
+
+// sortedCalendarKeys returns the keys of m in ascending order.
+func sortedCalendarKeys[V any](m map[string]V) []string {
+	keys := make([]string, 0, len(m))
+	for key := range m {
+		keys = append(keys, key)
+	}
+	slices.Sort(keys)
+
+	return keys
+}
diff --git a/scrapers/calendar.go b/scrapers/calendar.go
index 4429c2e..7b69d9a 100644
--- a/scrapers/calendar.go
+++ b/scrapers/calendar.go
@@ -133,7 +133,7 @@ func ScrapeCalendar(outDir string) {
 		log.Printf("Parsed the events of page %d successfully!\n\n", page+1)
 	}
 
-	if err := utils.WriteJSON(fmt.Sprintf("%s/events.json", outDir), events); err != nil {
+	if err := utils.WriteJSON(fmt.Sprintf("%s/eventScraped.json", outDir), events); err != nil {
 		panic(err)
 	}
 	log.Printf("Finished parsing %d events successfully!\n\n", len(events))