diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md index ed7bc52..057b2cd 100644 --- a/.github/ISSUE_TEMPLATE/bug_report.md +++ b/.github/ISSUE_TEMPLATE/bug_report.md @@ -1,7 +1,7 @@ --- name: Bug report about: Create a report & testcase to help us improve -title: "\U0001F41B" +title: labels: '' assignees: '' @@ -26,3 +26,7 @@ Example: **Additional context** Add any other context about the problem here. + +
+ Paste Error Traceback here, if any +
diff --git a/CHANGELOG.md b/CHANGELOG.md index cc9e8e1..984e939 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,6 @@ +# v0.3.2 +- 🐛 ✅ Enforce clean=True when doc_type="pdf" - \#75 + # v0.3.1 - 🚑 ✅ Handle Newline character & update tests diff --git a/pysbd/about.py b/pysbd/about.py index 052283f..99369bf 100644 --- a/pysbd/about.py +++ b/pysbd/about.py @@ -2,7 +2,7 @@ # https://python-packaging-user-guide.readthedocs.org/en/latest/single_source_version/ __title__ = "pysbd" -__version__ = "0.3.1" +__version__ = "0.3.2" __summary__ = "pysbd (Python Sentence Boundary Disambiguation) is a rule-based sentence boundary detection that works out-of-the-box across many languages." __uri__ = "http://nipunsadvilkar.github.io/" __author__ = "Nipun Sadvilkar" diff --git a/pysbd/processor.py b/pysbd/processor.py index df591f1..8cf7737 100644 --- a/pysbd/processor.py +++ b/pysbd/processor.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- import re -from pysbd.utils import Text, TextSpan +from pysbd.utils import Text from pysbd.lists_item_replacer import ListItemReplacer from pysbd.exclamation_words import ExclamationWords from pysbd.between_punctuation import BetweenPunctuation diff --git a/pysbd/segmenter.py b/pysbd/segmenter.py index 98dfc77..dc32d26 100644 --- a/pysbd/segmenter.py +++ b/pysbd/segmenter.py @@ -31,6 +31,15 @@ def __init__(self, language="en", clean=False, doc_type=None, char_span=False): self.clean = clean self.doc_type = doc_type self.char_span = char_span + if self.clean and self.char_span: + raise ValueError("char_span must be False if clean is True. 
" + "Since `clean=True` will modify original text.") + # when doctype is pdf then force user to clean the text + # char_span func won't be provided with pdf doctype also + elif self.doc_type == 'pdf' and not self.clean: + raise ValueError("`doc_type='pdf'` should have `clean=True` & " + "`char_span` should be False since original " + "text will be modified.") def cleaner(self, text): if hasattr(self.language_module, "Cleaner"): @@ -71,11 +80,10 @@ def segment(self, text): self.original_text = text if not text: return [] - if self.clean and self.char_span: - raise ValueError("char_span must be False if clean is True. " - "Since `clean=True` will modify original text.") - elif self.clean: + + if self.clean or self.doc_type == 'pdf': text = self.cleaner(text).clean() + postprocessed_sents = self.processor(text).process() sentence_w_char_spans = self.sentences_with_char_spans(postprocessed_sents) if self.char_span: diff --git a/tests/test_segmenter.py b/tests/test_segmenter.py index 6ec64ae..5f7341a 100644 --- a/tests/test_segmenter.py +++ b/tests/test_segmenter.py @@ -68,8 +68,27 @@ def test_exception_with_both_clean_and_span_true(): """ with pytest.raises(ValueError) as e: seg = pysbd.Segmenter(language="en", clean=True, char_span=True) - text = "

Hello

\n

This is a test. Another test.

" - seg.segment(text) + assert str(e.value) == "char_span must be False if clean is True. "\ + "Since `clean=True` will modify original text." + +def test_exception_with_doc_type_pdf_and_clean_false(): + """ + Test to force clean=True when doc_type="pdf" + """ + with pytest.raises(ValueError) as e: + seg = pysbd.Segmenter(language="en", clean=False, doc_type='pdf') + assert str(e.value) == ("`doc_type='pdf'` should have `clean=True` & " + "`char_span` should be False since original " + "text will be modified.") + +def test_exception_with_doc_type_pdf_and_both_clean_char_span_true(): + """ + Test to raise ValueError exception when doc_type="pdf" and + both clean=True and char_span=True + """ + with pytest.raises(ValueError) as e: + seg = pysbd.Segmenter(language="en", clean=True, + doc_type='pdf', char_span=True) assert str(e.value) == "char_span must be False if clean is True. "\ "Since `clean=True` will modify original text."