Skip to content

Commit

Permalink
Merge pull request #77 from nipunsadvilkar/npn-pdf-mode-exceptions
Browse files Browse the repository at this point in the history
  • Loading branch information
nipunsadvilkar authored Sep 11, 2020
2 parents 9069997 + fc61aef commit 91676b8
Show file tree
Hide file tree
Showing 6 changed files with 43 additions and 9 deletions.
6 changes: 5 additions & 1 deletion .github/ISSUE_TEMPLATE/bug_report.md
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
---
name: Bug report
about: Create a report & testcase to help us improve
title: "\U0001F41B"
title: <Appropriate title>
labels: ''
assignees: ''

Expand All @@ -26,3 +26,7 @@ Example:

**Additional context**
Add any other context about the problem here.

<details>
<summary>Paste Error Traceback here, if any</summary>
</details>
3 changes: 3 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
# v0.3.2
- 🐛 ✅ Enforce clean=True when doc_type="pdf" - \#75

# v0.3.1
- 🚑 ✅ Handle Newline character & update tests

Expand Down
2 changes: 1 addition & 1 deletion pysbd/about.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
# https://python-packaging-user-guide.readthedocs.org/en/latest/single_source_version/

__title__ = "pysbd"
__version__ = "0.3.1"
__version__ = "0.3.2"
__summary__ = "pysbd (Python Sentence Boundary Disambiguation) is a rule-based sentence boundary detection that works out-of-the-box across many languages."
__uri__ = "http://nipunsadvilkar.github.io/"
__author__ = "Nipun Sadvilkar"
Expand Down
2 changes: 1 addition & 1 deletion pysbd/processor.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
import re
from pysbd.utils import Text, TextSpan
from pysbd.utils import Text
from pysbd.lists_item_replacer import ListItemReplacer
from pysbd.exclamation_words import ExclamationWords
from pysbd.between_punctuation import BetweenPunctuation
Expand Down
16 changes: 12 additions & 4 deletions pysbd/segmenter.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,15 @@ def __init__(self, language="en", clean=False, doc_type=None, char_span=False):
self.clean = clean
self.doc_type = doc_type
self.char_span = char_span
if self.clean and self.char_span:
raise ValueError("char_span must be False if clean is True. "
"Since `clean=True` will modify original text.")
# When doc_type is 'pdf', require the user to clean the text;
# char_span is not available with the 'pdf' doc_type either.
elif self.doc_type == 'pdf' and not self.clean:
raise ValueError("`doc_type='pdf'` should have `clean=True` & "
"`char_span` should be False since original"
"text will be modified.")

def cleaner(self, text):
if hasattr(self.language_module, "Cleaner"):
Expand Down Expand Up @@ -71,11 +80,10 @@ def segment(self, text):
self.original_text = text
if not text:
return []
if self.clean and self.char_span:
raise ValueError("char_span must be False if clean is True. "
"Since `clean=True` will modify original text.")
elif self.clean:

if self.clean or self.doc_type == 'pdf':
text = self.cleaner(text).clean()

postprocessed_sents = self.processor(text).process()
sentence_w_char_spans = self.sentences_with_char_spans(postprocessed_sents)
if self.char_span:
Expand Down
23 changes: 21 additions & 2 deletions tests/test_segmenter.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,8 +68,27 @@ def test_exception_with_both_clean_and_span_true():
"""
with pytest.raises(ValueError) as e:
seg = pysbd.Segmenter(language="en", clean=True, char_span=True)
text = "<h2 class=\"lined\">Hello</h2>\n<p>This is a test. Another test.</p>"
seg.segment(text)
assert str(e.value) == "char_span must be False if clean is True. "\
"Since `clean=True` will modify original text."

def test_exception_with_doc_type_pdf_and_clean_false():
    """
    Test to force clean=True when doc_type="pdf".

    Building a Segmenter with doc_type='pdf' and clean=False is expected to
    raise ValueError at construction time.
    """
    with pytest.raises(ValueError) as e:
        # The call is expected to raise, so its result is not bound to a name.
        pysbd.Segmenter(language="en", clean=False, doc_type='pdf')
    # Checked after the `with` block: a statement placed after the raising
    # call inside the block would never run.
    # The expected text mirrors the raised message verbatim, including the
    # missing space between "original" and "text".
    assert str(e.value) == ("`doc_type='pdf'` should have `clean=True` & "
                            "`char_span` should be False since original"
                            "text will be modified.")

def test_exception_with_doc_type_pdf_and_both_clean_char_span_true():
    """
    Test to raise ValueError exception when doc_type="pdf" and
    both clean=True and char_span=True.

    The clean/char_span conflict is expected to be reported even though
    doc_type='pdf' is combined with clean=True.
    """
    with pytest.raises(ValueError) as e:
        # The call is expected to raise, so its result is not bound to a name.
        pysbd.Segmenter(language="en", clean=True,
                        doc_type='pdf', char_span=True)
    # Checked after the `with` block: a statement placed after the raising
    # call inside the block would never run.
    assert str(e.value) == ("char_span must be False if clean is True. "
                            "Since `clean=True` will modify original text.")

Expand Down

0 comments on commit 91676b8

Please sign in to comment.