Skip to content

Commit f3e7ddc

Browse files
Refresh scraper (#460)
Co-authored-by: Copilot Autofix powered by AI <62310815+github-advanced-security[bot]@users.noreply.github.com>
1 parent 35d844f commit f3e7ddc

File tree

406 files changed

+254419
-196032
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

406 files changed

+254419
-196032
lines changed

internal/models/schema-recipe.go

+61-11
Original file line numberDiff line numberDiff line change
@@ -986,17 +986,67 @@ func (n *NutritionSchema) UnmarshalJSON(data []byte) error {
986986

987987
switch x := v.(type) {
988988
case map[string]any:
989-
n.Calories = regex.Digit.FindString(strings.ReplaceAll(extensions.ConvertToString(x["calories"]), ",", "."))
990-
n.Carbohydrates = regex.Digit.FindString(strings.ReplaceAll(extensions.ConvertToString(x["carbohydrateContent"]), ",", "."))
991-
n.Cholesterol = regex.Digit.FindString(strings.ReplaceAll(extensions.ConvertToString(x["cholesterolContent"]), ",", "."))
992-
n.Fat = regex.Digit.FindString(strings.ReplaceAll(extensions.ConvertToString(x["fatContent"]), ",", "."))
993-
n.SaturatedFat = regex.Digit.FindString(strings.ReplaceAll(extensions.ConvertToString(x["saturatedFatContent"]), ",", "."))
994-
n.UnsaturatedFat = regex.Digit.FindString(strings.ReplaceAll(extensions.ConvertToString(x["unsaturatedFatContent"]), ",", "."))
995-
n.TransFat = regex.Digit.FindString(strings.ReplaceAll(extensions.ConvertToString(x["transFatContent"]), ",", "."))
996-
n.Protein = regex.Digit.FindString(strings.ReplaceAll(extensions.ConvertToString(x["proteinContent"]), ",", "."))
997-
n.Sugar = regex.Digit.FindString(strings.ReplaceAll(extensions.ConvertToString(x["sugarContent"]), ",", "."))
998-
n.Sodium = regex.Digit.FindString(strings.ReplaceAll(extensions.ConvertToString(x["sodiumContent"]), ",", "."))
999-
n.Fiber = regex.Digit.FindString(strings.ReplaceAll(extensions.ConvertToString(x["fiberContent"]), ",", "."))
989+
extract := func(nutrition string) string {
990+
return regex.Digit.FindString(strings.ReplaceAll(extensions.ConvertToString(x[nutrition]), ",", "."))
991+
}
992+
993+
if _, ok := x["calories"]; ok {
994+
n.Calories = extract("calories")
995+
}
996+
997+
if _, ok := x["carbohydrateContent"]; ok {
998+
n.Carbohydrates = extract("carbohydrateContent")
999+
} else if _, ok = x["carbs"]; ok {
1000+
n.Carbohydrates = extract("carbs")
1001+
}
1002+
1003+
if _, ok := x["cholesterolContent"]; ok {
1004+
n.Cholesterol = extract("cholesterolContent")
1005+
}
1006+
1007+
if _, ok := x["fatContent"]; ok {
1008+
n.Fat = extract("fatContent")
1009+
} else if _, ok = x["fat"]; ok {
1010+
n.Fat = extract("fat")
1011+
}
1012+
1013+
if _, ok := x["saturatedFatContent"]; ok {
1014+
n.SaturatedFat = extract("saturatedFatContent")
1015+
} else if _, ok = x["saturatedFat"]; ok {
1016+
n.SaturatedFat = extract("saturatedFat")
1017+
}
1018+
1019+
if _, ok := x["unsaturatedFatContent"]; ok {
1020+
n.UnsaturatedFat = extract("unsaturatedFatContent")
1021+
}
1022+
1023+
if _, ok := x["transFatContent"]; ok {
1024+
n.TransFat = extract("transFatContent")
1025+
}
1026+
1027+
if _, ok := x["proteinContent"]; ok {
1028+
n.Protein = extract("proteinContent")
1029+
} else if _, ok = x["protein"]; ok {
1030+
n.Protein = extract("protein")
1031+
}
1032+
1033+
if _, ok := x["sugarContent"]; ok {
1034+
n.Sugar = extract("sugarContent")
1035+
} else if _, ok = x["sugars"]; ok {
1036+
n.Sugar = extract("sugars")
1037+
}
1038+
1039+
if _, ok := x["sodiumContent"]; ok {
1040+
n.Sodium = extract("sodiumContent")
1041+
} else if _, ok = x["salt"]; ok {
1042+
n.Sodium = extract("salt")
1043+
}
1044+
1045+
if _, ok := x["fiberContent"]; ok {
1046+
n.Fiber = extract("fiberContent")
1047+
} else if _, ok = x["fibre"]; ok {
1048+
n.Fiber = extract("fibre")
1049+
}
10001050

10011051
if val := extensions.ConvertToString(x["servingSize"]); val != "" {
10021052
xs := strings.Split(val, " ")

internal/scraper/alittlebityummy.go

+4
Original file line numberDiff line numberDiff line change
@@ -30,5 +30,9 @@ func scrapeALittleBitYummy(root *goquery.Document) (models.RecipeSchema, error)
3030
}
3131
})
3232

33+
for i, s := range rs.Instructions.Values {
34+
rs.Instructions.Values[i].Text = strings.Join(strings.Fields(s.Text), " ")
35+
}
36+
3337
return rs, nil
3438
}

internal/scraper/epicurious.go

+48
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
package scraper

import (
	"strings"

	"github.com/PuerkitoBio/goquery"
	"github.com/reaper47/recipya/internal/models"
)

// scrapeEpicurious extracts a recipe schema from an epicurious.com page and
// normalizes its human-readable cook/prep/total times to ISO 8601 durations.
func scrapeEpicurious(root *goquery.Document) (models.RecipeSchema, error) {
	rs, err := parseWebsite(root)
	if err != nil {
		return models.RecipeSchema{}, err
	}

	// parseTime converts a human-readable duration such as "45 minutes" or
	// "1 hour 30 minutes" to an ISO 8601 duration ("PT45M", "PT1H30M").
	// Fix: unrecognized formats (one or three tokens) previously produced
	// the invalid bare duration "PT"; they are now returned unchanged so no
	// information is lost. strings.Fields also tolerates repeated spaces,
	// unlike the previous strings.Split(s, " ").
	parseTime := func(s string) string {
		parts := strings.Fields(s)
		switch n := len(parts); {
		case n == 2:
			// "<number> <unit>": minutes when the unit starts with "min"
			// (case-insensitive), otherwise assume hours.
			if strings.HasPrefix(strings.ToLower(parts[1]), "min") {
				return "PT" + parts[0] + "M"
			}
			return "PT" + parts[0] + "H"
		case n >= 4:
			// "<hours> <unit> <minutes> <unit>", e.g. "1 hour 30 minutes".
			return "PT" + parts[0] + "H" + parts[2] + "M"
		default:
			// Unparseable — keep the raw value rather than emit invalid "PT".
			return s
		}
	}

	// Only convert values that are not already ISO 8601 durations.
	if rs.CookTime != "" && !strings.HasPrefix(rs.CookTime, "PT") {
		rs.CookTime = parseTime(rs.CookTime)
	}

	if rs.PrepTime != "" && !strings.HasPrefix(rs.PrepTime, "PT") {
		rs.PrepTime = parseTime(rs.PrepTime)
	}

	if rs.TotalTime != "" && !strings.HasPrefix(rs.TotalTime, "PT") {
		rs.TotalTime = parseTime(rs.TotalTime)
	}

	return rs, nil
}

internal/scraper/finedininglovers.go

+32
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
package scraper

import (
	"strings"

	"github.com/PuerkitoBio/goquery"
	"github.com/reaper47/recipya/internal/models"
)

// scrapeFineDiningLovers extracts a recipe schema from a finedininglovers.com
// page, overriding the generic parse with the site's own ingredient and
// instruction markup.
func scrapeFineDiningLovers(root *goquery.Document) (models.RecipeSchema, error) {
	rs, err := parseWebsite(root)
	if err != nil {
		return models.RecipeSchema{}, err
	}

	// Ingredients live in dedicated paragraph elements.
	ingredients := root.Find(".field--name-field-srh-ingredients p")
	rs.Ingredients.Values = make([]string, 0, ingredients.Length())
	ingredients.Each(func(_ int, sel *goquery.Selection) {
		rs.Ingredients.Values = append(rs.Ingredients.Values, strings.TrimSpace(sel.Text()))
	})

	// Each step paragraph becomes one instruction.
	steps := root.Find(".paragraph--type-srh-step")
	rs.Instructions.Values = make([]models.HowToItem, 0, steps.Length())
	steps.Each(func(_ int, sel *goquery.Selection) {
		text := strings.TrimSpace(sel.Text())
		// NOTE(review): the character substitutions below look unusual
		// (stripping plain spaces, turning NBSP into newlines) — presumably
		// tailored to this site's markup; confirm against live pages.
		text = strings.ReplaceAll(text, " ", "")
		text = strings.ReplaceAll(text, "\u00a0", "\n")
		text = strings.ReplaceAll(text, "\n\n\n", "\n\n")
		rs.Instructions.Values = append(rs.Instructions.Values, models.NewHowToStep(text))
	})

	return rs, nil
}

internal/scraper/foodbag.go

-172
This file was deleted.

internal/scraper/gazoakleychef.go

+1-1
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ func scrapeGazoakleychef(root *goquery.Document) (models.RecipeSchema, error) {
1313

1414
rs.DateModified = getPropertyContent(root, "article:modified_time")
1515
rs.Image.Value = getPropertyContent(root, "og:image")
16-
rs.Name = root.Find(".entry-title").Text()
16+
rs.Name = root.Find(".entry-header .entry-title").First().Text()
1717

1818
root.Find(".entry-quick-info div.row div").Each(func(_ int, sel *goquery.Selection) {
1919
c := strings.TrimSpace(sel.Text())

internal/scraper/gousto.go

+6-2
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ import (
77
"github.com/reaper47/recipya/internal/models"
88
"io"
99
"net/http"
10+
"slices"
1011
"strconv"
1112
"strings"
1213
)
@@ -159,7 +160,7 @@ func (s *Scraper) scrapeGousto(rawURL string) (models.RecipeSchema, error) {
159160

160161
entry := g.Data.Entry
161162
rs := models.NewRecipeSchema()
162-
rs.Name = entry.Title
163+
rs.Name = strings.TrimSpace(entry.Title)
163164

164165
if len(entry.Categories) > 0 {
165166
cat := strings.ToLower(entry.Categories[len(entry.Categories)-1].Title)
@@ -174,7 +175,7 @@ func (s *Scraper) scrapeGousto(rawURL string) (models.RecipeSchema, error) {
174175
rs.Image.Value = entry.Media.Images[len(entry.Media.Images)-1].Image
175176
}
176177

177-
rs.Description.Value = entry.Description
178+
rs.Description.Value = strings.TrimSpace(entry.Description)
178179

179180
if entry.PrepTimes.For2 > 0 {
180181
rs.PrepTime = "PT" + strconv.Itoa(entry.PrepTimes.For2) + "M"
@@ -189,6 +190,9 @@ func (s *Scraper) scrapeGousto(rawURL string) (models.RecipeSchema, error) {
189190
for _, basic := range entry.Basics {
190191
rs.Ingredients.Values = append(rs.Ingredients.Values, basic.Title)
191192
}
193+
rs.Ingredients.Values = slices.DeleteFunc(rs.Ingredients.Values, func(s string) bool {
194+
return s == ""
195+
})
192196

193197
rs.Instructions.Values = make([]models.HowToItem, 0, len(entry.CookingInstructions))
194198
for _, ins := range entry.CookingInstructions {

internal/scraper/instantpot.go

+1-1
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,7 @@ func scrapeInstantPot(root *goquery.Document) (models.RecipeSchema, error) {
5353
rs.PrepTime = prep
5454
}
5555

56-
getIngredients(&rs, root.Find(".article__ingredients").First().Find("li"))
56+
getIngredients(&rs, root.Find(".article__ingredients").Last().Find("li"))
5757
getInstructions(&rs, root.Find(".article__instructions").First().Find("li"))
5858

5959
return rs, nil

0 commit comments

Comments
 (0)