// assertEqual fails the test immediately unless v1 and v2 are deeply equal
// as defined by reflect.DeepEqual.
func assertEqual(tb testing.TB, v1, v2 interface{}) {
	if !reflect.DeepEqual(v1, v2) {
		tb.Fatalf("'%+v' and '%+v' are not equal", v1, v2)
	}
}

// assertNoErr fails the test immediately if err is non-nil.
func assertNoErr(tb testing.TB, err error) {
	if err != nil {
		tb.Fatalf("expected no err, but got: %s", err.Error())
	}
}

// assertErr fails the test immediately if err is nil.
func assertErr(tb testing.TB, err error) {
	if err == nil {
		tb.Fatal("expected err, but got nil")
	}
}

// assertTrue fails the test immediately if v is false.
func assertTrue(tb testing.TB, v bool) {
	if !v {
		tb.Fatal("expected true, but got false")
	}
}

// assertFalse fails the test immediately if v is true.
func assertFalse(tb testing.TB, v bool) {
	if v {
		tb.Fatal("expected false, but got true")
	}
}

// assertNil fails the test immediately unless v is nil. Both an untyped nil
// interface and a typed nil (nil pointer, map, slice, chan, func, ...) count
// as nil.
func assertNil(tb testing.TB, v interface{}) {
	if v == nil {
		return
	}
	// reflect.Value.IsNil panics for kinds that cannot be nil (int, string,
	// struct, ...), so only consult it for nillable kinds. Any other non-nil
	// value is simply "not nil" and must be reported as a test failure.
	switch rv := reflect.ValueOf(v); rv.Kind() {
	case reflect.Chan, reflect.Func, reflect.Interface, reflect.Map,
		reflect.Ptr, reflect.Slice, reflect.UnsafePointer:
		if rv.IsNil() {
			return
		}
	}
	tb.Fatalf("expected nil, but got: %+v", v)
}

// assertPanic runs f and marks the test as failed (without stopping it) if f
// returns normally instead of panicking.
func assertPanic(t *testing.T, f func()) {
	defer func() {
		if r := recover(); r == nil {
			t.Errorf("The code did not panic")
		}
	}()
	f()
}
// loadFunc computes the value for a key that is not present in the cache.
type loadFunc func(key interface{}) (interface{}, error)

const (
	// defaultCap is the capacity used by the package-level RegexpCache.
	defaultCap = 65536
)

// loadingCache is a simple loading cache that is reset wholesale once its
// capacity is reached.
//
// The reason we're building a simple capacity-resetting loading cache instead of using something like
// github.com/hashicorp/golang-lru is primarily to avoid creating an external dependency.
// Currently this library has 0 external dep (other than go sdk), and supports go 1.6, 1.9, and 1.10 (and later).
// Creating external lib dependencies (plus their transitive dependencies) would make things hard if not impossible.
// We expect under most circumstances, the defaultCap is big enough for any long running services that use this
// library if their xpath regexp cardinality is low. However, in extreme cases when the capacity is reached, we
// simply reset the cache, taking a small subsequent perf hit (next to nothing considering amortization) instead
// of adopting a more complex and less performant LRU type of construct.
type loadingCache struct {
	sync.RWMutex // guards m and reset

	cap   int                         // maximum number of entries; 0 means unbounded
	load  loadFunc                    // computes the value for a missing key
	m     map[interface{}]interface{} // cached key -> value entries
	reset int                         // number of times m was discarded because cap was reached
}

// NewLoadingCache creates a new instance of a loading cache with capacity. Capacity must be >= 0, or
// it will panic. Capacity == 0 means the cache growth is unbounded.
func NewLoadingCache(load loadFunc, capacity int) *loadingCache {
	if capacity < 0 {
		panic("capacity must be >= 0")
	}
	return &loadingCache{cap: capacity, load: load, m: make(map[interface{}]interface{})}
}

// get returns the cached value for key, calling the load function on a miss
// and caching its result. A load error is returned as-is and nothing is
// cached for that key.
//
// The load function runs without any lock held, so several goroutines may
// load the same key at the same time; the first one to store wins and every
// caller gets that stored value.
func (c *loadingCache) get(key interface{}) (interface{}, error) {
	c.RLock()
	v, found := c.m[key]
	c.RUnlock()
	if found {
		return v, nil
	}
	v, err := c.load(key)
	if err != nil {
		return nil, err
	}
	c.Lock()
	defer c.Unlock()
	// Re-check under the write lock: another goroutine may have stored this
	// key while we were loading. Without this check a full cache would be
	// reset (dropping every entry) just to re-insert a key it already holds.
	if cached, ok := c.m[key]; ok {
		return cached, nil
	}
	if c.cap > 0 && len(c.m) >= c.cap {
		// Capacity reached: start over with only the new entry.
		c.m = map[interface{}]interface{}{key: v}
		c.reset++
	} else {
		c.m[key] = v
	}
	return v, nil
}

var (
	// RegexpCache is a loading cache for string -> *regexp.Regexp mapping. It is exported so that in rare cases
	// client can customize load func and/or capacity.
	RegexpCache = defaultRegexpCache()
)

// defaultRegexpCache builds a cache of defaultCap entries that compiles its
// string keys with regexp.Compile. Keys that are not strings make the load
// function panic.
func defaultRegexpCache() *loadingCache {
	return NewLoadingCache(
		func(key interface{}) (interface{}, error) {
			return regexp.Compile(key.(string))
		}, defaultCap)
}

// getRegexp returns the compiled form of pattern, served from RegexpCache when
// available. It returns a nil regexp and the compile error if pattern is
// invalid.
func getRegexp(pattern string) (*regexp.Regexp, error) {
	exp, err := RegexpCache.get(pattern)
	if err != nil {
		return nil, err
	}
	return exp.(*regexp.Regexp), nil
}
capacity, m is reset + v, err = c.get(3) + assertNoErr(t, err) + assertEqual(t, "3", v) + assertEqual(t, 1, len(c.m)) + + // Invalid capacity + assertPanic(t, func() { + NewLoadingCache(func(key interface{}) (interface{}, error) { return key, nil }, -1) + }) + + // Loading failure + c = NewLoadingCache( + func(key interface{}) (interface{}, error) { + if key.(int)%2 == 0 { + return key, nil + } else { + return nil, fmt.Errorf("artificial error: %d", key.(int)) + } + }, 0) + v, err = c.get(12) + assertNoErr(t, err) + assertEqual(t, 12, v) + _, err = c.get(21) + assertErr(t, err) + assertEqual(t, "artificial error: 21", err.Error()) +} + +const ( + benchLoadingCacheRandSeed = 12345 + benchLoadingCacheConcurrency = 5 + benchLoadingCacheKeyRange = 2000 + benchLoadingCacheCap = 1000 +) + +func BenchmarkLoadingCacheCapped_SingleThread(b *testing.B) { + rand.Seed(benchLoadingCacheRandSeed) + c := NewLoadingCache( + func(key interface{}) (interface{}, error) { + return key, nil + }, benchLoadingCacheCap) + for i := 0; i < b.N; i++ { + k := rand.Intn(benchLoadingCacheKeyRange) + v, _ := c.get(k) + if k != v { + b.FailNow() + } + } + b.Logf("N=%d, reset=%d", b.N, c.reset) +} + +func BenchmarkLoadingCacheCapped_MultiThread(b *testing.B) { + rand.Seed(benchLoadingCacheRandSeed) + c := NewLoadingCache( + func(key interface{}) (interface{}, error) { + return key, nil + }, benchLoadingCacheCap) + wg := sync.WaitGroup{} + wg.Add(benchLoadingCacheConcurrency) + for i := 0; i < benchLoadingCacheConcurrency; i++ { + go func() { + for j := 0; j < b.N; j++ { + k := rand.Intn(benchLoadingCacheKeyRange) + v, _ := c.get(k) + if k != v { + b.FailNow() + } + } + defer wg.Done() + }() + } + wg.Wait() + b.Logf("N=%d, concurrency=%d, reset=%d", b.N, benchLoadingCacheConcurrency, c.reset) +} + +func BenchmarkLoadingCacheNoCap_SingleThread(b *testing.B) { + rand.Seed(benchLoadingCacheRandSeed) + c := NewLoadingCache( + func(key interface{}) (interface{}, error) { + return key, nil + }, 0) // 0 
=> no cap + for i := 0; i < b.N; i++ { + k := rand.Intn(benchLoadingCacheKeyRange) + v, _ := c.get(k) + if k != v { + b.FailNow() + } + } + b.Logf("N=%d, reset=%d", b.N, c.reset) +} + +func BenchmarkLoadingCacheNoCap_MultiThread(b *testing.B) { + rand.Seed(benchLoadingCacheRandSeed) + c := NewLoadingCache( + func(key interface{}) (interface{}, error) { + return key, nil + }, 0) // 0 => no cap + wg := sync.WaitGroup{} + wg.Add(benchLoadingCacheConcurrency) + for i := 0; i < benchLoadingCacheConcurrency; i++ { + go func() { + for j := 0; j < b.N; j++ { + k := rand.Intn(benchLoadingCacheKeyRange) + v, _ := c.get(k) + if k != v { + b.FailNow() + } + } + defer wg.Done() + }() + } + wg.Wait() + b.Logf("N=%d, concurrency=%d, reset=%d", b.N, benchLoadingCacheConcurrency, c.reset) +} + +func TestGetRegexp(t *testing.T) { + RegexpCache = defaultRegexpCache() + assertEqual(t, 0, len(RegexpCache.m)) + assertEqual(t, defaultCap, RegexpCache.cap) + exp, err := getRegexp("^[0-9]{3,5}$") + assertNoErr(t, err) + assertTrue(t, exp.MatchString("3141")) + assertFalse(t, exp.MatchString("3")) + exp, err = getRegexp("[invalid") + assertErr(t, err) + assertEqual(t, "error parsing regexp: missing closing ]: `[invalid`", err.Error()) + assertNil(t, exp) +} diff --git a/func.go b/func.go index df2542b..fd4187b 100644 --- a/func.go +++ b/func.go @@ -10,7 +10,7 @@ import ( "unicode" ) -// Defined an interface of stringBuilder that compatible with +// Defined an interface of stringBuilder that compatible with // strings.Builder(go 1.10) and bytes.Buffer(< go 1.10) type stringBuilder interface { WriteRune(r rune) (n int, err error) @@ -354,6 +354,35 @@ func containsFunc(arg1, arg2 query) func(query, iterator) interface{} { } } +// matchesFunc is an XPath function that tests a given string against a regexp pattern. 
+// Note: does not support https://www.w3.org/TR/xpath-functions-31/#func-matches 3rd optional `flags` argument; if +// needed, directly put flags in the regexp pattern, such as `(?i)^pattern$` for `i` flag. +func matchesFunc(arg1, arg2 query) func(query, iterator) interface{} { + return func(q query, t iterator) interface{} { + var s string + switch typ := functionArgs(arg1).Evaluate(t).(type) { + case string: + s = typ + case query: + node := typ.Select(t) + if node == nil { + return "" + } + s = node.Value() + } + var pattern string + var ok bool + if pattern, ok = functionArgs(arg2).Evaluate(t).(string); !ok { + panic(errors.New("matches() function second argument type must be string")) + } + re, err := getRegexp(pattern) + if err != nil { + panic(fmt.Errorf("matches() function second argument is not a valid regexp pattern, err: %s", err.Error())) + } + return re.MatchString(s) + } +} + // normalizespaceFunc is XPath functions normalize-space(string?) func normalizespaceFunc(q query, t iterator) interface{} { var m string diff --git a/xpath_test.go b/xpath_test.go index a864665..c8a174f 100644 --- a/xpath_test.go +++ b/xpath_test.go @@ -225,8 +225,9 @@ func TestFunction(t *testing.T) { testXPath(t, html, "//*[starts-with(name(),'h1')]", "h1") testXPath(t, html, "//*[ends-with(name(),'itle')]", "title") // Head title testXPath2(t, html, "//*[contains(@href,'a')]", 2) - testXPath2(t, html, "//*[starts-with(@href,'/a')]", 2) // a links: `/account`,`/about` - testXPath2(t, html, "//*[ends-with(@href,'t')]", 2) // a links: `/account`,`/about` + testXPath2(t, html, "//*[starts-with(@href,'/a')]", 2) // a links: `/account`,`/about` + testXPath2(t, html, "//*[ends-with(@href,'t')]", 2) // a links: `/account`,`/about` + testXPath2(t, html, "//*[matches(@href,'(?i)^.*OU[A-Z]?T$')]", 2) // a links: `/account`,`/about`. 
Note use of `(?i)` testXPath3(t, html, "//h1[normalize-space(text())='This is a H1']", selectNode(html, "//h1")) testXPath3(t, html, "//title[substring(.,1)='Hello']", selectNode(html, "//title")) testXPath3(t, html, "//title[substring(text(),1,4)='Hell']", selectNode(html, "//title")) @@ -309,6 +310,12 @@ func TestPanic(t *testing.T) { // contains assertPanic(t, func() { testXPath2(t, html, "//*[contains(0, 0)]", 0) }) assertPanic(t, func() { testXPath2(t, html, "//*[contains(@href, 0)]", 0) }) + // matches + assertPanic(t, func() { testXPath2(t, html, "//*[matches()]", 0) }) // arg len check failure + assertPanic(t, func() { testXPath2(t, html, "//*[matches(substring(), 0)]", 0) }) // first arg processing failure + assertPanic(t, func() { testXPath2(t, html, "//*[matches(@href, substring())]", 0) }) // second arg processing failure + assertPanic(t, func() { testXPath2(t, html, "//*[matches(@href, 0)]", 0) }) // second arg not string + assertPanic(t, func() { testXPath2(t, html, "//*[matches(@href, '[invalid')]", 0) }) // second arg invalid regexp // sum assertPanic(t, func() { testXPath3(t, html, "//title[sum('Hello') = 0]", nil) }) // substring @@ -319,15 +326,6 @@ func TestPanic(t *testing.T) { } -func assertPanic(t *testing.T, f func()) { - defer func() { - if r := recover(); r == nil { - t.Errorf("The code did not panic") - } - }() - f() -} - func TestEvaluate(t *testing.T) { testEval(t, html, "count(//ul/li)", float64(4)) testEval(t, html, "//html/@lang", []string{"en"})