Skip to content

Commit

Permalink
Merge pull request #4 from xxxsen/xxxsen/feature/add_scrape_source
Browse files Browse the repository at this point in the history
Xxxsen/feature/add scrape source
  • Loading branch information
xxxsen committed Aug 20, 2024
2 parents ae32e2d + 892d9d0 commit 0ed0743
Show file tree
Hide file tree
Showing 13 changed files with 179 additions and 26 deletions.
1 change: 1 addition & 0 deletions config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ func defaultConfig() *Config {
"fc2",
"18av",
"freejavbt",
"tktube",
"avsox",
},
Handlers: []string{
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
package plugin
package parser

import (
"context"
Expand All @@ -9,17 +9,6 @@ import (
"go.uber.org/zap"
)

// DefaultDurationParser builds a decoder.NumberParseFunc that converts a raw
// duration string via utils.ToDuration. When conversion fails the error is
// logged through the logger bound to ctx and 0 is returned.
func DefaultDurationParser(ctx context.Context) decoder.NumberParseFunc {
	parse := func(raw string) int64 {
		dur, err := utils.ToDuration(raw)
		if err == nil {
			return dur
		}
		// Parsing is best-effort: report the bad input and fall back to zero.
		logutil.GetLogger(ctx).Error("decode duration failed", zap.Error(err), zap.String("data", raw))
		return 0
	}
	return parse
}

func DefaultReleaseDateParser(ctx context.Context) decoder.NumberParseFunc {
return func(v string) int64 {
val, err := utils.ToTimestamp(v)
Expand Down
45 changes: 45 additions & 0 deletions searcher/parser/duration_parser.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
package parser

import (
"context"
"math"
"strconv"
"strings"
"yamdc/searcher/decoder"
"yamdc/searcher/utils"

"github.com/xxxsen/common/logutil"
"go.uber.org/zap"
)

// DefaultHHMMSSDurationParser builds a decoder.NumberParseFunc that converts a
// colon-separated duration ("SS", "MM:SS" or "HH:MM:SS") into a total number
// of seconds. Whitespace around each component is ignored.
//
// The returned function logs an error through the logger bound to ctx and
// returns 0 when the input has more than three components, when a component
// is not a base-10 integer, when a component is negative, or when the total
// would overflow int64.
func DefaultHHMMSSDurationParser(ctx context.Context) decoder.NumberParseFunc {
	return func(v string) int64 {
		parts := strings.Split(v, ":")
		if len(parts) > 3 {
			logutil.GetLogger(ctx).Error("invalid time format", zap.String("data", v))
			return 0
		}
		var sec int64
		// Walk from the rightmost component (seconds) to the leftmost one,
		// multiplying the place value by 60 at every step. Integer arithmetic
		// avoids the float rounding that math.Pow would introduce.
		unit := int64(1)
		for i := len(parts) - 1; i >= 0; i-- {
			item := strings.TrimSpace(parts[i])
			val, err := strconv.ParseInt(item, 10, 64)
			if err != nil {
				logutil.GetLogger(ctx).Error("invalid time format", zap.Error(err), zap.String("data", v))
				return 0
			}
			// A negative component is never a valid clock field, and the
			// second condition rejects values whose contribution would
			// overflow the running total (sec is always >= 0 here).
			if val < 0 || val > (math.MaxInt64-sec)/unit {
				logutil.GetLogger(ctx).Error("invalid time format", zap.String("data", v))
				return 0
			}
			sec += val * unit
			unit *= 60
		}
		return sec
	}
}

// DefaultDurationParser builds a decoder.NumberParseFunc that converts a raw
// duration string via utils.ToDuration. When conversion fails the error is
// logged through the logger bound to ctx and 0 is returned.
func DefaultDurationParser(ctx context.Context) decoder.NumberParseFunc {
	parse := func(raw string) int64 {
		dur, err := utils.ToDuration(raw)
		if err == nil {
			return dur
		}
		// Parsing is best-effort: report the bad input and fall back to zero.
		logutil.GetLogger(ctx).Error("decode duration failed", zap.Error(err), zap.String("data", raw))
		return 0
	}
	return parse
}
24 changes: 24 additions & 0 deletions searcher/parser/duration_parser_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
package parser

import (
"context"
"testing"

"github.com/stretchr/testify/assert"
)

// testPair is a single table-driven test case: in is the raw duration string
// handed to the parser and sec is the number of seconds expected back.
type testPair struct {
in string
sec int64
}

// TestHHMMSS checks that DefaultHHMMSSDurationParser turns "HH:MM:SS" and
// "MM:SS" strings (with stray spaces around the fields) into total seconds.
func TestHHMMSS(t *testing.T) {
	cases := []testPair{
		{in: "01 :01: 01", sec: 3600 + 60 + 1},
		{in: "02: 05", sec: 120 + 5},
	}
	for idx := range cases {
		tc := cases[idx]
		parse := DefaultHHMMSSDurationParser(context.Background())
		got := parse(tc.in)
		assert.Equal(t, tc.sec, got)
	}
}
5 changes: 3 additions & 2 deletions searcher/plugin/18av.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ import (
"yamdc/model"
"yamdc/number"
"yamdc/searcher/decoder"
"yamdc/searcher/parser"
)

type av18 struct {
Expand Down Expand Up @@ -82,8 +83,8 @@ func (p *av18) OnDecodeHTTPData(ctx *PluginContext, data []byte) (*model.AvMeta,
meta, err := dec.DecodeHTML(data,
decoder.WithCoverParser(p.coverParser),
decoder.WithPlotParser(p.plotParser),
decoder.WithDurationParser(DefaultDurationParser(ctx.GetContext())),
decoder.WithReleaseDateParser(DefaultReleaseDateParser(ctx.GetContext())),
decoder.WithDurationParser(parser.DefaultDurationParser(ctx.GetContext())),
decoder.WithReleaseDateParser(parser.DefaultReleaseDateParser(ctx.GetContext())),
)
if err != nil {
return nil, false, err
Expand Down
5 changes: 3 additions & 2 deletions searcher/plugin/avsox.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ import (
"yamdc/model"
"yamdc/number"
"yamdc/searcher/decoder"
"yamdc/searcher/parser"
"yamdc/searcher/utils"

"github.com/xxxsen/common/logutil"
Expand Down Expand Up @@ -121,8 +122,8 @@ func (p *avsox) OnDecodeHTTPData(ctx *PluginContext, data []byte) (*model.AvMeta
SampleImageListExpr: "",
}
meta, err := dec.DecodeHTML(data,
decoder.WithReleaseDateParser(DefaultReleaseDateParser(ctx.GetContext())),
decoder.WithDurationParser(DefaultDurationParser(ctx.GetContext())),
decoder.WithReleaseDateParser(parser.DefaultReleaseDateParser(ctx.GetContext())),
decoder.WithDurationParser(parser.DefaultDurationParser(ctx.GetContext())),
decoder.WithDefaultStringProcessor(strings.TrimSpace),
)
if err != nil {
Expand Down
1 change: 1 addition & 0 deletions searcher/plugin/constant.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,4 +11,5 @@ const (
SSFreeJavBt = "freejavbt"
SSJavDB = "javdb"
SS18AV = "18av"
SSTKTube = "tktube"
)
5 changes: 3 additions & 2 deletions searcher/plugin/freejavbt.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ import (
"yamdc/model"
"yamdc/number"
"yamdc/searcher/decoder"
"yamdc/searcher/parser"
putils "yamdc/searcher/utils"
)

Expand Down Expand Up @@ -36,8 +37,8 @@ func (p *freejavbt) OnDecodeHTTPData(ctx *PluginContext, data []byte) (*model.Av
SampleImageListExpr: `//div[@class="preview"]/a/img/@data-src`,
}
res, err := dec.DecodeHTML(data,
decoder.WithDurationParser(DefaultDurationParser(ctx.GetContext())),
decoder.WithReleaseDateParser(DefaultReleaseDateParser(ctx.GetContext())),
decoder.WithDurationParser(parser.DefaultDurationParser(ctx.GetContext())),
decoder.WithReleaseDateParser(parser.DefaultReleaseDateParser(ctx.GetContext())),
)
if err != nil {
return nil, false, err
Expand Down
5 changes: 3 additions & 2 deletions searcher/plugin/jav321.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ import (
"yamdc/model"
"yamdc/number"
"yamdc/searcher/decoder"
"yamdc/searcher/parser"
putils "yamdc/searcher/utils"
)

Expand Down Expand Up @@ -70,8 +71,8 @@ func (p *jav321) OnDecodeHTTPData(ctx *PluginContext, data []byte) (*model.AvMet
}
rs, err := dec.DecodeHTML(data,
decoder.WithDefaultStringProcessor(p.defaultStringProcessor),
decoder.WithReleaseDateParser(DefaultReleaseDateParser(ctx.GetContext())),
decoder.WithDurationParser(DefaultDurationParser(ctx.GetContext())),
decoder.WithReleaseDateParser(parser.DefaultReleaseDateParser(ctx.GetContext())),
decoder.WithDurationParser(parser.DefaultDurationParser(ctx.GetContext())),
)
if err != nil {
return nil, false, err
Expand Down
5 changes: 3 additions & 2 deletions searcher/plugin/javbus.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ import (
"yamdc/model"
"yamdc/number"
"yamdc/searcher/decoder"
"yamdc/searcher/parser"
putils "yamdc/searcher/utils"
)

Expand Down Expand Up @@ -52,8 +53,8 @@ func (p *javbus) OnDecodeHTTPData(ctx *PluginContext, data []byte) (*model.AvMet
SampleImageListExpr: `//div[@id="sample-waterfall"]/a[@class="sample-box"]/@href`,
}
rs, err := dec.DecodeHTML(data,
decoder.WithReleaseDateParser(DefaultReleaseDateParser(ctx.GetContext())),
decoder.WithDurationParser(DefaultDurationParser(ctx.GetContext())),
decoder.WithReleaseDateParser(parser.DefaultReleaseDateParser(ctx.GetContext())),
decoder.WithDurationParser(parser.DefaultDurationParser(ctx.GetContext())),
)
if err != nil {
return nil, false, err
Expand Down
5 changes: 3 additions & 2 deletions searcher/plugin/javdb.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ import (
"yamdc/model"
"yamdc/number"
"yamdc/searcher/decoder"
"yamdc/searcher/parser"
"yamdc/searcher/utils"
)

Expand Down Expand Up @@ -67,8 +68,8 @@ func (p *javdb) OnDecodeHTTPData(ctx *PluginContext, data []byte) (*model.AvMeta
SampleImageListExpr: `//div[@class="tile-images preview-images"]/a[@class="tile-item"]/@href`,
}
meta, err := dec.DecodeHTML(data,
decoder.WithReleaseDateParser(DefaultReleaseDateParser(ctx.GetContext())),
decoder.WithDurationParser(DefaultDurationParser(ctx.GetContext())),
decoder.WithReleaseDateParser(parser.DefaultReleaseDateParser(ctx.GetContext())),
decoder.WithDurationParser(parser.DefaultDurationParser(ctx.GetContext())),
)
if err != nil {
return nil, false, err
Expand Down
5 changes: 3 additions & 2 deletions searcher/plugin/javhoo.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ import (
"yamdc/model"
"yamdc/number"
"yamdc/searcher/decoder"
"yamdc/searcher/parser"
putils "yamdc/searcher/utils"
)

Expand Down Expand Up @@ -36,8 +37,8 @@ func (p *javhoo) OnDecodeHTTPData(ctx *PluginContext, data []byte) (*model.AvMet
SampleImageListExpr: `//div[@id="sample-box"]/div/a/@href`,
}
meta, err := dec.DecodeHTML(data,
decoder.WithReleaseDateParser(DefaultReleaseDateParser(ctx.GetContext())),
decoder.WithDurationParser(DefaultDurationParser(ctx.GetContext())),
decoder.WithReleaseDateParser(parser.DefaultReleaseDateParser(ctx.GetContext())),
decoder.WithDurationParser(parser.DefaultDurationParser(ctx.GetContext())),
)
if err != nil {
return nil, false, err
Expand Down
86 changes: 86 additions & 0 deletions searcher/plugin/tktube.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
package plugin

import (
"fmt"
"net/http"
"strings"
"yamdc/model"
"yamdc/number"
"yamdc/searcher/decoder"
"yamdc/searcher/parser"
)

// tktube is the scrape-source plugin for tktube.com. It embeds DefaultPlugin
// and supplies its own request, search and decode hooks.
type tktube struct {
DefaultPlugin
}

// OnPrecheckRequest reports whether this plugin should handle n: only number
// IDs that number.IsFc2 accepts are searched on this source. The error result
// is always nil.
func (p *tktube) OnPrecheckRequest(ctx *PluginContext, n *number.Number) (bool, error) {
	id := n.GetNumberID()
	supported := number.IsFc2(id)
	return supported, nil
}

// OnMakeHTTPRequest stores the number ID in ctx under the "number" key and
// builds the GET request for the site's search page. Every "-" in the ID is
// doubled to "--" before it is placed in the search URL.
func (p *tktube) OnMakeHTTPRequest(ctx *PluginContext, n *number.Number) (*http.Request, error) {
	// Remember the unmodified ID so later hooks can match and report it.
	ctx.SetKey("number", n.GetNumberID())
	keyword := strings.ReplaceAll(n.GetNumberID(), "-", "--")
	searchURL := fmt.Sprintf("https://tktube.com/zh/search/%s/", keyword)
	return http.NewRequest(http.MethodGet, searchURL, nil)
}

// OnHandleHTTPRequest delegates to HandleXPathTwoStepSearch with two XPath
// queries against the search result list: one for the result links and one
// for the result titles. The link selector picks the first result whose
// title contains the requested number ID (case-insensitive) and reports
// "not found" when no title matches.
//
// NOTE(review): HandleXPathTwoStepSearch is defined elsewhere; presumably it
// runs the search request, evaluates the XPath pairs and then fetches the
// selected link — confirm against its implementation.
func (p *tktube) OnHandleHTTPRequest(ctx *PluginContext, invoker HTTPInvoker, req *http.Request) (*http.Response, error) {
	// The ID was stored by OnMakeHTTPRequest; upper-case it once for matching.
	numberID := strings.ToUpper(ctx.GetKeyOrDefault("number", "").(string))
	return HandleXPathTwoStepSearch(ctx, invoker, req, &XPathTwoStepContext{
		Ps: []*XPathPair{
			{
				Name:  "links",
				XPath: `//div[@id="list_videos_videos_list_search_result_items"]/div/a/@href`,
			},
			{
				Name:  "names",
				XPath: `//div[@id="list_videos_videos_list_search_result_items"]/div/a/strong[@class="title"]/text()`,
			},
		},
		LinkSelector: func(ps []*XPathPair) (string, bool, error) {
			links := ps[0].Result
			names := ps[1].Result
			// Bound the loop by both slices so a page that yields fewer
			// titles than links cannot cause an index-out-of-range panic.
			for i := 0; i < len(links) && i < len(names); i++ {
				if strings.Contains(strings.ToUpper(names[i]), numberID) {
					return links[i], true, nil
				}
			}
			return "", false, nil
		},
		ValidStatusCode:       []int{http.StatusOK},
		CheckResultCountMatch: true,
		LinkPrefix:            "",
	})
}

// OnDecodeHTTPData decodes a detail page into a model.AvMeta using XPath
// expressions. Only the title, actors, release date, duration, genres and
// cover are extracted; the remaining expressions are left empty. Durations
// are parsed with the HH:MM:SS parser, and the resulting meta's Number is
// taken from the "number" key stored in ctx. On success the boolean result
// is true; on a decode error it returns (nil, false, err).
func (p *tktube) OnDecodeHTTPData(ctx *PluginContext, data []byte) (*model.AvMeta, bool, error) {
	htmlDecoder := decoder.XPathHtmlDecoder{
		// Fields this source provides.
		TitleExpr:       `//div[@class="headline"]/h1/text()`,
		ActorListExpr:   `//div[contains(text(), "女優:")]/a[contains(@href, "models")]/text()`,
		ReleaseDateExpr: `//div[@class="item"]/span[contains(text(), "加入日期:")]/em/text()`,
		DurationExpr:    `//div[@class="item"]/span[contains(text(), "時長:")]/em/text()`,
		GenreListExpr:   `//div[contains(text(), "標籤:")]/a[contains(@href, "tags")]/text()`,
		CoverExpr:       `//meta[@property="og:image"]/@content`,
		// Fields with no expression for this source.
		PlotExpr:            "",
		StudioExpr:          "",
		LabelExpr:           "",
		DirectorExpr:        "",
		SeriesExpr:          "",
		PosterExpr:          "",
		SampleImageListExpr: "",
	}
	avMeta, err := htmlDecoder.DecodeHTML(
		data,
		decoder.WithDurationParser(parser.DefaultHHMMSSDurationParser(ctx.GetContext())),
		decoder.WithReleaseDateParser(parser.DefaultReleaseDateParser(ctx.GetContext())),
	)
	if err != nil {
		return nil, false, err
	}
	// Report the ID that was originally requested rather than anything
	// scraped from the page.
	avMeta.Number = ctx.GetKeyOrDefault("number", "").(string)
	return avMeta, true, nil
}

func init() {
Register(SSTKTube, PluginToCreator(&tktube{}))
}

0 comments on commit 0ed0743

Please sign in to comment.