From fc53624cf29422f552374dba2ffc26d178722406 Mon Sep 17 00:00:00 2001 From: jpahm <20374744+jpahm@users.noreply.github.com> Date: Wed, 21 Aug 2024 21:09:53 -0500 Subject: [PATCH] Fix profile scraper, add HEADLESS_MODE env var --- .env.template | 1 + README.md | 2 +- go.mod | 10 +++++----- go.sum | 14 +++++++++++--- scrapers/coursebook.go | 16 ++++++++++++---- scrapers/profiles.go | 12 ++++++------ 6 files changed, 36 insertions(+), 19 deletions(-) diff --git a/.env.template b/.env.template index 23ac8c2..2ead8de 100644 --- a/.env.template +++ b/.env.template @@ -1,6 +1,7 @@ #Scrapers LOGIN_NETID= LOGIN_PASSWORD= +HEADLESS_MODE=false #Uploader MONGODB_URI= \ No newline at end of file diff --git a/README.md b/README.md index f462d14..a944eb2 100644 --- a/README.md +++ b/README.md @@ -24,7 +24,7 @@ Part of [Project Nebula](https://about.utdnebula.com). ### Prerequisites -- Golang 1.19 (or higher) +- Golang 1.23 (or higher) ### Development diff --git a/go.mod b/go.mod index 8137c5c..d6acab6 100644 --- a/go.mod +++ b/go.mod @@ -1,12 +1,12 @@ module github.com/UTDNebula/api-tools -go 1.19 +go 1.23 require ( github.com/PuerkitoBio/goquery v1.8.1 github.com/UTDNebula/nebula-api/api v0.0.0-20240423212728-2ef02f280c6c - github.com/chromedp/cdproto v0.0.0-20240421230201-ab917191657d - github.com/chromedp/chromedp v0.9.5 + github.com/chromedp/cdproto v0.0.0-20240801214329-3f85d328b335 + github.com/chromedp/chromedp v0.10.0 github.com/joho/godotenv v1.5.1 go.mongodb.org/mongo-driver v1.15.0 ) @@ -26,7 +26,7 @@ require ( github.com/go-playground/validator/v10 v10.19.0 // indirect github.com/gobwas/httphead v0.1.0 // indirect github.com/gobwas/pool v0.2.1 // indirect - github.com/gobwas/ws v1.3.2 // indirect + github.com/gobwas/ws v1.4.0 // indirect github.com/goccy/go-json v0.10.2 // indirect github.com/golang/snappy v0.0.4 // indirect github.com/gorilla/schema v1.3.0 // indirect @@ -51,7 +51,7 @@ require ( golang.org/x/crypto v0.22.0 // indirect golang.org/x/net v0.24.0 // indirect golang.org/x/sync v0.7.0 // indirect - golang.org/x/sys v0.19.0 // indirect + golang.org/x/sys v0.22.0 // indirect golang.org/x/text v0.14.0 // indirect google.golang.org/protobuf v1.33.0 // indirect gopkg.in/yaml.v3 v3.0.1 // indirect diff --git a/go.sum b/go.sum index bbecc99..93abb0c 100644 --- a/go.sum +++ b/go.sum @@ -12,8 +12,12 @@ github.com/bytedance/sonic/loader v0.1.1/go.mod h1:ncP89zfokxS5LZrJxl5z0UJcsk4M4 github.com/chromedp/cdproto v0.0.0-20240202021202-6d0b6a386732/go.mod h1:GKljq0VrfU4D5yc+2qA6OVr8pmO/MBbPEWqWQ/oqGEs= github.com/chromedp/cdproto v0.0.0-20240421230201-ab917191657d h1:x9d0XwRV3aWw1gAZtv0LrI39U+Efjp0mtyXRyikGb9Y= github.com/chromedp/cdproto v0.0.0-20240421230201-ab917191657d/go.mod h1:GKljq0VrfU4D5yc+2qA6OVr8pmO/MBbPEWqWQ/oqGEs= +github.com/chromedp/cdproto v0.0.0-20240801214329-3f85d328b335 h1:bATMoZLH2QGct1kzDxfmeBUQI/QhQvB0mBrOTct+YlQ= +github.com/chromedp/cdproto v0.0.0-20240801214329-3f85d328b335/go.mod h1:GKljq0VrfU4D5yc+2qA6OVr8pmO/MBbPEWqWQ/oqGEs= github.com/chromedp/chromedp v0.9.5 h1:viASzruPJOiThk7c5bueOUY91jGLJVximoEMGoH93rg= github.com/chromedp/chromedp v0.9.5/go.mod h1:D4I2qONslauw/C7INoCir1BJkSwBYMyZgx8X276z3+Y= +github.com/chromedp/chromedp v0.10.0 h1:bRclRYVpMm/UVD76+1HcRW9eV3l58rFfy7AdBvKab1E= +github.com/chromedp/chromedp v0.10.0/go.mod h1:ei/1ncZIqXX1YnAYDkxhD4gzBgavMEUu7JCKvztdomE= github.com/chromedp/sysutil v1.0.0 h1:+ZxhTpfpZlmchB58ih/LBHX52ky7w2VhQVKQMucy3Ic= github.com/chromedp/sysutil v1.0.0/go.mod h1:kgWmDdq8fTzXYcKIBqIYvRRTnYb9aNS9moAV0xufSww= github.com/cloudwego/base64x v0.1.3 h1:b5J/l8xolB7dyDTTmhJP2oTs5LdrjyrUFuNxdfq5hAg= @@ -30,6 +34,7 @@ github.com/gin-contrib/sse v0.1.0/go.mod h1:RHrZQHXnP2xjPF+u1gW/2HnVO7nvIa9PG3Gm github.com/gin-gonic/gin v1.9.1 h1:4idEAncQnU5cB7BeOkPtxjfCSye0AAm1R0RVIqJ+Jmg= github.com/gin-gonic/gin v1.9.1/go.mod h1:hPrL7YrpYKXt5YId3A/Tnip5kqbEAP+KLuI3SUcPTeU= github.com/go-playground/assert/v2 v2.2.0 h1:JvknZsQTYeFEAhQwI4qEt9cyV5ONwRHC+lYKSsYSR8s= +github.com/go-playground/assert/v2 v2.2.0/go.mod h1:VDjEfimB/XKnb+ZQfWdccd7VUvScMdVu0Titje2rxJ4= github.com/go-playground/locales v0.14.1 h1:EWaQ/wswjilfKLTECiXz7Rh+3BjFhfDFKv/oXslEjJA= github.com/go-playground/locales v0.14.1/go.mod h1:hxrqLVvrK65+Rwrd5Fc6F2O76J/NuW9t0sjnWqG1slY= github.com/go-playground/universal-translator v0.18.1 h1:Bcnm0ZwsGyWbCzImXv+pAJnYK9S473LQFuzCbDbfSFY= @@ -40,13 +45,15 @@ github.com/gobwas/httphead v0.1.0 h1:exrUm0f4YX0L7EBwZHuCF4GDp8aJfVeBrlLQrs6NqWU github.com/gobwas/httphead v0.1.0/go.mod h1:O/RXo79gxV8G+RqlR/otEwx4Q36zl9rqC5u12GKvMCM= github.com/gobwas/pool v0.2.1 h1:xfeeEhW7pwmX8nuLVlqbzVc7udMDrwetjEv+TZIz1og= github.com/gobwas/pool v0.2.1/go.mod h1:q8bcK0KcYlCgd9e7WYLm9LpyS+YeLd8JVDW6WezmKEw= -github.com/gobwas/ws v1.3.2 h1:zlnbNHxumkRvfPWgfXu8RBwyNR1x8wh9cf5PTOCqs9Q= github.com/gobwas/ws v1.3.2/go.mod h1:hRKAFb8wOxFROYNsT1bqfWnhX+b5MFeJM9r2ZSwg/KY= +github.com/gobwas/ws v1.4.0 h1:CTaoG1tojrh4ucGPcoJFiAQUAsEWekEWvLy7GsVNqGs= +github.com/gobwas/ws v1.4.0/go.mod h1:G3gNqMNtPppf5XUz7O4shetPpcZ1VJ7zt18dlUeakrc= github.com/goccy/go-json v0.10.2 h1:CrxCmQqYDkv1z7lO7Wbh2HN93uovUHgrECaO5ZrCXAU= github.com/goccy/go-json v0.10.2/go.mod h1:6MelG93GURQebXPDq3khkgXZkazVtN9CRI+MGFi0w8I= github.com/golang/snappy v0.0.4 h1:yAGX7huGHXlcLOEtBnF4w7FQwA26wojNCwOYAEhLjQM= github.com/golang/snappy v0.0.4/go.mod h1:/XxbfmMg8lxefKM7IXC3fBNl/7bRcc72aCRzEWrmP2Q= github.com/google/go-cmp v0.5.5 h1:Khx7svrCpmxxtHBq5j2mp/xVjsi8hQMfNLvJFAlrGgU= +github.com/google/go-cmp v0.5.5/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= github.com/gorilla/schema v1.3.0 h1:rbciOzXAx3IB8stEFnfTwO3sYa6EWlQk79XdyustPDA= github.com/gorilla/schema v1.3.0/go.mod h1:Dg5SSm5PV60mhF2NFaTV1xuYYj8tV8NOPRo4FggUMnM= @@ -141,8 +148,8 @@ golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBc golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.16.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= -golang.org/x/sys v0.19.0 h1:q5f1RH2jigJ1MoAWp2KTp3gm5zAGFUTarQZ5U386+4o= -golang.org/x/sys v0.19.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/sys v0.22.0 h1:RI27ohtqKCnwULzJLqkv897zojh5/DwS/ENaMzUOaWI= +golang.org/x/sys v0.22.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k= @@ -159,6 +166,7 @@ golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtn golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc= golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543 h1:E7g+9GITq07hpfrRu66IVDexMakfv52eLZ2CXBWiKr4= +golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= google.golang.org/protobuf v1.33.0 h1:uNO2rsAINq/JlFpSdYEKIZ0uKD/R9cpdv0T+yoGwGmI= google.golang.org/protobuf v1.33.0/go.mod h1:c6P6GXX6sHbq/GpV6MGZEdwhWPcYBgnhAHhKbcUYpos= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM= diff --git a/scrapers/coursebook.go b/scrapers/coursebook.go index 3287078..ae35278 100644 --- a/scrapers/coursebook.go +++ b/scrapers/coursebook.go @@ -12,6 +12,7 @@ import ( "log" "net/http" "os" + "strconv" "strings" "time" @@ -23,9 +24,15 @@ import ( func initChromeDp() (chromedpCtx context.Context, cancelFnc context.CancelFunc) { log.Printf("Initializing chromedp...") - allocCtx, cancelFnc := chromedp.NewExecAllocator(context.Background()) - chromedpCtx, _ = chromedp.NewContext(allocCtx) - log.Printf("Initialized chromedp!") + headlessEnv, present := os.LookupEnv("HEADLESS_MODE") + doHeadless, _ := strconv.ParseBool(headlessEnv) + if present && doHeadless { + chromedpCtx, cancelFnc = chromedp.NewContext(context.Background()) + log.Printf("Initialized chromedp!") + } else { + allocCtx, _ := chromedp.NewExecAllocator(context.Background()) + chromedpCtx, cancelFnc = chromedp.NewContext(allocCtx) + } return } @@ -50,8 +57,9 @@ func refreshToken(chromedpCtx context.Context) map[string][]string { chromedp.WaitVisible(`form#login-form`), chromedp.SendKeys(`input#netid`, netID), chromedp.SendKeys(`input#password`, password), + chromedp.WaitVisible(`input#login-button`), chromedp.Click(`input#login-button`), - chromedp.WaitVisible(`body`), + //chromedp.WaitVisible(`body`), ) if err != nil { panic(err) diff --git a/scrapers/profiles.go b/scrapers/profiles.go index 867ead5..500edac 100644 --- a/scrapers/profiles.go +++ b/scrapers/profiles.go @@ -58,7 +58,7 @@ func parseList(list []string) (string, schema.Location) { var office schema.Location for _, element := range list { - element = strings.Trim(element, " ") + element = strings.TrimSpace(element) utils.VPrintf("Element is: %s", element) if strings.Contains(element, "-") { phoneNumber = element @@ -168,7 +168,7 @@ func ScrapeProfiles(outDir string) { chromedp.Navigate(link), chromedp.ActionFunc(func(ctx context.Context) error { var text string - err := chromedp.Text("//h2", &text).Do(ctx) + err := chromedp.Text("div.contact_info>h1", &text).Do(ctx) firstName, lastName = parseName(text) return err }), @@ -223,7 +223,7 @@ func ScrapeProfiles(outDir string) { utils.VPrint("Scraping titles...") err = chromedp.Run(chromedpCtx, - chromedp.QueryAfter("//h6", + chromedp.QueryAfter("div.profile-title", func(ctx context.Context, _ runtime.ExecutionContextID, nodes ...*cdp.Node) error { for _, node := range nodes { tempText := getNodeText(node) @@ -257,11 +257,11 @@ func ScrapeProfiles(outDir string) { utils.VPrint("Scraping list text...") err = chromedp.Run(chromedpCtx, - chromedp.QueryAfter("div.contact_info > div", + chromedp.QueryAfter("div.contact_info>div ~ div", func(ctx context.Context, _ runtime.ExecutionContextID, nodes ...*cdp.Node) error { var tempText string - err := chromedp.Text("div.contact_info > div", &tempText).Do(ctx) - texts = strings.Split(tempText, "") + err := chromedp.Text("div.contact_info>div ~ div", &tempText).Do(ctx) + texts = strings.Split(tempText, "\n") return err }, ),