Skip to content

Commit ccaaa5b

Browse files
committed
refactor text and cleaning tests; add more of them
1 parent 8b4ae25 commit ccaaa5b

File tree

2 files changed

+109
-65
lines changed

2 files changed

+109
-65
lines changed

tests/test_selector.py

-65
Original file line number | Diff line number | Diff line change
@@ -1339,71 +1339,6 @@ def test_set(self) -> None:
13391339
)
13401340

13411341

1342-
class SelectorTextTestCase(unittest.TestCase):
    """Checks for text extraction via ``get(text=True)`` / ``getall(text=True)``."""

    # Selector class under test; kept as an attribute so subclasses can swap it.
    sscls = Selector

    # Two-product listing used by the CSS / XPath multi-match tests.
    html_body = """
<body>
<div class="product">
<div class="name">Product1</div>
<span class="price"><b>Price:</b>100</span>
</div>
<div class="product">
<div class="name">Product2</div>
<span class="price"><b>Price:</b>200</span>
</div>
</body>
"""

    def test_text_get(self) -> None:
        """A paragraph followed by a heading is rendered with a blank line between."""
        extracted = self.sscls(text="<p>title:<h1>some text</h1></p>").get(text=True)
        self.assertEqual(extracted, "title:\n\nsome text")

    def test_text_getall(self) -> None:
        """``getall(text=True)`` on the root yields one newline-joined string."""
        page = self.sscls(text="<ul><li>option1</li><li>option2</li></ul>")
        texts = page.getall(text=True)
        self.assertEqual(1, len(texts))
        self.assertEqual("option1\noption2", texts[0])

    def test_text_cleaned_get(self) -> None:
        """After ``cleaned("html")`` only the paragraph text is extracted."""
        page = self.sscls(text="<p>paragraph</p><style>.items</style>")
        cleaned_page = page.cleaned("html")
        self.assertEqual("paragraph", cleaned_page.get(text=True))

    def test_text_get_guess_punct_space_false(self) -> None:
        """With ``guess_punct_space=False`` a space precedes the quoted text."""
        page = self.sscls(text='<p>hello<b>"Folks"</b></p>')
        extracted = page.get(text=True, guess_punct_space=False)
        self.assertEqual(extracted, 'hello "Folks"')

    def test_text_get_guess_layout_false(self) -> None:
        """With ``guess_layout=False`` list items are joined by a single space."""
        page = self.sscls(text="<ul><li>option1</li><li>option2</li></ul>")
        extracted = page.get(text=True, guess_layout=False)
        self.assertEqual(extracted, "option1 option2")

    def test_text_get_guess_layout_true(self) -> None:
        """With ``guess_layout=True`` list items are joined by a newline."""
        page = self.sscls(text="<ul><li>option1</li><li>option2</li></ul>")
        extracted = page.get(text=True, guess_layout=True)
        self.assertEqual(extracted, "option1\noption2")

    def test_text_css_multiple(self) -> None:
        """A CSS query matching two nodes yields one text string per node."""
        page = self.sscls(text=self.html_body)
        prices = page.css(".product .price")
        self.assertEqual(prices.getall(text=True), ["Price: 100", "Price: 200"])

    def test_text_xpath_get(self) -> None:
        """An XPath query matching two nodes yields one text string per node."""
        page = self.sscls(text=self.html_body)
        prices = page.xpath('//div[@class="product"]/span')
        self.assertEqual(
            prices.getall(text=True),
            ["Price: 100", "Price: 200"],
        )
1405-
1406-
14071342
class SelectorBytesInput(Selector):
14081343
def __init__(
14091344
self,

tests/test_text_and_cleaning.py

+109
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,109 @@
1+
from lxml.html.clean import Cleaner
2+
3+
from parsel import Selector
4+
5+
# Two-product listing consumed by the multi-node extraction tests defined
# directly below; without them this fixture is dead code.
HTML_BODY = """
<body>
<div class="product">
<div class="name">Product1</div>
<span class="price"><b>Price:</b>100</span>
</div>
<div class="product">
<div class="name">Product2</div>
<span class="price"><b>Price:</b>200</span>
</div>
</body>
"""


def test_text_css_multiple() -> None:
    """A CSS query matching several nodes yields one text string per node."""
    prices = Selector(HTML_BODY).css(".product .price")
    assert prices.getall(text=True) == ["Price: 100", "Price: 200"]


def test_text_xpath_multiple() -> None:
    """An XPath query matching several nodes yields one text string per node."""
    prices = Selector(HTML_BODY).xpath('//div[@class="product"]/span')
    assert prices.getall(text=True) == ["Price: 100", "Price: 200"]
17+
18+
19+
def test_text_get() -> None:
    """A paragraph followed by a heading is rendered with a blank line between."""
    extracted = Selector("<p>title:<h1>some text</h1></p>").get(text=True)
    assert extracted == "title:\n\nsome text"
23+
24+
25+
def test_text_getall() -> None:
    """``getall(text=True)`` returns one text string per selected node."""
    page = Selector("<ul><li>option1</li><li>option2</li></ul>")

    # Root selector: a single node, so a single newline-joined string.
    whole_document = page.getall(text=True)
    assert whole_document == ["option1\noption2"]

    # One selector per <li>: one string each.
    per_item = page.css("li").getall(text=True)
    assert per_item == ["option1", "option2"]
30+
31+
32+
def test_cleaned() -> None:
    """``cleaned()`` with no arguments drops ``<script>`` but keeps the rest.

    The assertions on ``sel`` that run after the first ``sel.cleaned()`` call
    also show that cleaning does not alter the selector it was called on.
    """
    # Well-formed fixture: the outer <div> is closed with </div>.
    div_html = "<div><script>SCRIPT</script><style>STYLE</style><p>hello</p></div>"
    sel = Selector(div_html)
    assert sel.css("script").getall() == ["<script>SCRIPT</script>"]
    assert sel.cleaned().css("script").getall() == []

    # The source selector still exposes every element after cleaned() ran.
    assert len(sel.css("script")) == 1
    assert len(sel.css("style")) == 1
    assert len(sel.css("p")) == 1

    # The cleaned copy has lost only the <script> element.
    assert len(sel.cleaned().css("script")) == 0
    assert len(sel.cleaned().css("style")) == 1
    assert len(sel.cleaned().css("p")) == 1
45+
46+
47+
def test_cleaned_options() -> None:
    """``cleaned()`` accepts no argument, ``"html"``, ``"text"`` or a ``Cleaner``.

    Each group below counts the ``<script>``, ``<style>`` and ``<p>`` elements
    that remain after cleaning with one of those options.
    """
    # Well-formed fixture: the outer <div> is closed with </div>.
    div_html = "<div><script>SCRIPT</script><style>STYLE</style><p>hello</p></div>"
    sel = Selector(div_html)

    # Baseline: the uncleaned selector contains one of each element.
    assert len(sel.css("script")) == 1
    assert len(sel.css("style")) == 1
    assert len(sel.css("p")) == 1

    # No argument: <script> is removed, <style> and <p> stay.
    assert len(sel.cleaned().css("script")) == 0
    assert len(sel.cleaned().css("style")) == 1
    assert len(sel.cleaned().css("p")) == 1

    # "html": same counts as the no-argument call for this document.
    assert len(sel.cleaned("html").css("script")) == 0
    assert len(sel.cleaned("html").css("style")) == 1
    assert len(sel.cleaned("html").css("p")) == 1

    # "text": both <script> and <style> are removed.
    assert len(sel.cleaned("text").css("script")) == 0
    assert len(sel.cleaned("text").css("style")) == 0
    assert len(sel.cleaned("text").css("p")) == 1

    # Custom lxml Cleaner: keeps scripts and styles, kills <p> instead.
    cleaner = Cleaner(kill_tags=["p"], scripts=False, style=False)
    assert len(sel.cleaned(cleaner).css("script")) == 1
    assert len(sel.cleaned(cleaner).css("style")) == 1
    assert len(sel.cleaned(cleaner).css("p")) == 0
70+
71+
72+
def test_get_cleaner() -> None:
    """Exercise the ``cleaner`` argument of ``get`` for text and markup output."""
    markup = "<div><script>SCRIPT</script><style>STYLE</style><p>P</p></div>"
    page = Selector(markup)
    drop_paragraphs = Cleaner(kill_tags=["p"], scripts=False, style=False)

    # Text output: first without a cleaner argument, then each explicit value.
    assert page.get(text=True) == "P"
    expected_text = [
        (None, "SCRIPT STYLE\n\nP"),
        ("html", "STYLE\n\nP"),
        ("text", "P"),
        (drop_paragraphs, "SCRIPT STYLE"),
    ]
    for cleaner_arg, text in expected_text:
        assert page.get(text=True, cleaner=cleaner_arg) == text

    # Markup output of the <div>: same sequence of cleaner values.
    block = page.css("div")
    assert block.get() == markup
    expected_markup = [
        (None, markup),
        ("html", "<div><style>STYLE</style><p>P</p></div>"),
        ("text", "<div><p>P</p></div>"),
        (drop_paragraphs, "<div><script>SCRIPT</script><style>STYLE</style></div>"),
    ]
    for cleaner_arg, html in expected_markup:
        assert block.get(cleaner=cleaner_arg) == html
92+
93+
94+
def test_guess_punct_space() -> None:
    """``guess_punct_space`` decides whether a space precedes the quoted text."""
    page = Selector('<p>hello<b>"Folks"</b></p>')
    spaced = 'hello "Folks"'
    unspaced = 'hello"Folks"'

    # Single-result API.
    assert page.get(text=True, guess_punct_space=False) == spaced
    assert page.get(text=True, guess_punct_space=True) == unspaced

    # List API gives the same strings wrapped in a one-element list.
    assert page.getall(text=True, guess_punct_space=False) == [spaced]
    assert page.getall(text=True, guess_punct_space=True) == [unspaced]
101+
102+
103+
def test_guess_layout() -> None:
    """``guess_layout`` switches list items between space- and newline-joined."""
    page = Selector("<ul><li>option1</li><li>option2</li></ul>")
    expected_by_flag = {False: "option1 option2", True: "option1\noption2"}

    # For each flag value check both the single-result and the list API.
    for flag, text in expected_by_flag.items():
        assert page.get(text=True, guess_layout=flag) == text
        assert page.getall(text=True, guess_layout=flag) == [text]

0 commit comments

Comments
 (0)