Merge pull request #77 from nipunsadvilkar/npn-pdf-mode-exceptions

nipunsadvilkar · Sep 11, 2020 · 91676b8 · 91676b8
2 parents 9069997 + fc61aef
commit 91676b8
Show file tree

Hide file tree

Showing 6 changed files with 43 additions and 9 deletions.
diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md
@@ -1,7 +1,7 @@
 ---
 name: Bug report
 about: Create a report & testcase to help us improve
-title: "\U0001F41B"
+title: <Appropriate title>
 labels: ''
 assignees: ''
 
@@ -26,3 +26,7 @@ Example:
 
 **Additional context**
 Add any other context about the problem here.
+
+<details>
+  <summary>Paste Error Traceback here, if any</summary>
+</details>
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,3 +1,6 @@
+# v0.3.2
+- 🐛 ✅ Enforce clean=True when doc_type="pdf" - \#75
+
 # v0.3.1
 - 🚑 ✅ Handle Newline character & update tests
 

diff --git a/pysbd/about.py b/pysbd/about.py
@@ -2,7 +2,7 @@
 # https://python-packaging-user-guide.readthedocs.org/en/latest/single_source_version/
 
 __title__ = "pysbd"
-__version__ = "0.3.1"
+__version__ = "0.3.2"
 __summary__ = "pysbd (Python Sentence Boundary Disambiguation) is a rule-based sentence boundary detection that works out-of-the-box across many languages."
 __uri__ = "http://nipunsadvilkar.github.io/"
 __author__ = "Nipun Sadvilkar"

diff --git a/pysbd/processor.py b/pysbd/processor.py
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-
 import re
-from pysbd.utils import Text, TextSpan
+from pysbd.utils import Text
 from pysbd.lists_item_replacer import ListItemReplacer
 from pysbd.exclamation_words import ExclamationWords
 from pysbd.between_punctuation import BetweenPunctuation

diff --git a/pysbd/segmenter.py b/pysbd/segmenter.py
@@ -31,6 +31,15 @@ def __init__(self, language="en", clean=False, doc_type=None, char_span=False):
         self.clean = clean
         self.doc_type = doc_type
         self.char_span = char_span
+        if self.clean and self.char_span:
+            raise ValueError("char_span must be False if clean is True. "
+                             "Since `clean=True` will modify original text.")
+        # When doc_type is 'pdf', require the user to pass clean=True;
+        # char_span is therefore not available with the pdf doc_type either.
+        elif self.doc_type == 'pdf' and not self.clean:
+            raise ValueError("`doc_type='pdf'` should have `clean=True` & "
+                            "`char_span` should be False since original"
+                            "text will be modified.")
 
     def cleaner(self, text):
         if hasattr(self.language_module, "Cleaner"):
@@ -71,11 +80,10 @@ def segment(self, text):
         self.original_text = text
         if not text:
             return []
-        if self.clean and self.char_span:
-            raise ValueError("char_span must be False if clean is True. "
-                             "Since `clean=True` will modify original text.")
-        elif self.clean:
+
+        if self.clean or self.doc_type == 'pdf':
             text = self.cleaner(text).clean()
+
         postprocessed_sents = self.processor(text).process()
         sentence_w_char_spans = self.sentences_with_char_spans(postprocessed_sents)
         if self.char_span:

diff --git a/tests/test_segmenter.py b/tests/test_segmenter.py
@@ -68,8 +68,27 @@ def test_exception_with_both_clean_and_span_true():
     """
     with pytest.raises(ValueError) as e:
         seg = pysbd.Segmenter(language="en", clean=True, char_span=True)
-        text = "<h2 class=\"lined\">Hello</h2>\n<p>This is a test. Another test.</p>"
-        seg.segment(text)
+    assert str(e.value) == "char_span must be False if clean is True. "\
+                            "Since `clean=True` will modify original text."
+
+def test_exception_with_doc_type_pdf_and_clean_false():
+    """
+    Test to force clean=True when doc_type="pdf"
+    """
+    with pytest.raises(ValueError) as e:
+        seg = pysbd.Segmenter(language="en", clean=False, doc_type='pdf')
+    assert str(e.value) == ("`doc_type='pdf'` should have `clean=True` & "
+                            "`char_span` should be False since original"
+                            "text will be modified.")
+
+def test_exception_with_doc_type_pdf_and_both_clean_char_span_true():
+    """
+    Test to raise ValueError exception when doc_type="pdf" and
+    both clean=True and char_span=True
+    """
+    with pytest.raises(ValueError) as e:
+        seg = pysbd.Segmenter(language="en", clean=True,
+                                doc_type='pdf', char_span=True)
     assert str(e.value) == "char_span must be False if clean is True. "\
                             "Since `clean=True` will modify original text."