diff --git a/README.md b/README.md index 0b9fd49..88ba97a 100644 --- a/README.md +++ b/README.md @@ -1,11 +1,13 @@ # pySBD: Python Sentence Boundary Disambiguation (SBD) -[![Build Status](https://travis-ci.org/nipunsadvilkar/pySBD.svg?branch=master)](https://travis-ci.org/nipunsadvilkar/pySBD) [![License](https://img.shields.io/badge/license-MIT-brightgreen.svg?style=flat)](https://github.com/nipunsadvilkar/pySBD/blob/master/LICENSE) +[![Build Status](https://travis-ci.org/nipunsadvilkar/pySBD.svg?branch=master)](https://travis-ci.org/nipunsadvilkar/pySBD) [![License](https://img.shields.io/badge/license-MIT-brightgreen.svg?style=flat)](https://github.com/nipunsadvilkar/pySBD/blob/master/LICENSE) [![PyPi](https://img.shields.io/pypi/v/pysbd?color=blue&logo=pypi&logoColor=white)](https://pypi.python.org/pypi/pysbd) [![GitHub](https://img.shields.io/github/v/release/nipunsadvilkar/pySBD.svg?include_prereleases&logo=github&style=flat)](https://github.com/nipunsadvilkar/pySBD) pySBD - python Sentence Boundary Disambiguation (SBD) - is a rule-based sentence boundary detection module that works out-of-the-box. This project is a direct port of ruby gem - [Pragmatic Segmenter](https://github.com/diasks2/pragmatic_segmenter) which provides rule-based sentence boundary detection. +![pysbd_code](artifacts/pysbd_code.png?raw=true "pysbd_code") + ## Install **Python** @@ -25,6 +27,27 @@ print(seg.segment(text)) ``` - Use `pysbd` as a [spaCy](https://spacy.io/usage/processing-pipelines) pipeline component. (recommended)
Please refer to example [pysbd\_as\_spacy\_component.py](https://github.com/nipunsadvilkar/pySBD/blob/master/examples/pysbd_as_spacy_component.py) +- Use pysbd through [entrypoints](https://spacy.io/usage/saving-loading#entry-points-components) + +```python +import spacy +from pysbd.utils import PySBDFactory + +nlp = spacy.blank('en') + +# explicitly adding component to pipeline +# (recommended - makes it more readable to tell what's going on) +nlp.add_pipe(PySBDFactory(nlp)) + +# or you can use it implicitly with keyword +# pysbd = nlp.create_pipe('pysbd') +# nlp.add_pipe(pysbd) + +doc = nlp('My name is Jonas E. Smith. Please turn to p. 55.') +print(list(doc.sents)) +# [My name is Jonas E. Smith., Please turn to p. 55.] + +``` ## Contributing diff --git a/artifacts/pysbd_code.png b/artifacts/pysbd_code.png new file mode 100644 index 0000000..42c7aad Binary files /dev/null and b/artifacts/pysbd_code.png differ diff --git a/pysbd/about.py b/pysbd/about.py index 61b049c..df6155b 100644 --- a/pysbd/about.py +++ b/pysbd/about.py @@ -2,7 +2,7 @@ # https://python-packaging-user-guide.readthedocs.org/en/latest/single_source_version/ __title__ = "pysbd" -__version__ = "0.2.0" +__version__ = "0.2.1" __summary__ = "pysbd (Python Sentence Boundary Disambiguation) is a rule-based sentence boundary detection that works out-of-the-box across many languages." 
__uri__ = "http://nipunsadvilkar.github.io/" __author__ = "Nipun Sadvilkar" diff --git a/pysbd/utils.py b/pysbd/utils.py index 03b6d5f..9bbd440 100644 --- a/pysbd/utils.py +++ b/pysbd/utils.py @@ -1,6 +1,7 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- import re +import pysbd class Rule(object): @@ -65,6 +66,26 @@ def __eq__(self, other): return False +class PySBDFactory(object): + """pysbd as a spacy component through entrypoints""" + + def __init__(self, nlp, language='en', clean=False, char_span=True): + self.nlp = nlp + self.seg = pysbd.Segmenter(language=language, clean=clean, + char_span=char_span) + + def __call__(self, doc): + sents_char_spans = self.seg.segment(doc.text) + char_spans = [doc.char_span(sent_span.start, sent_span.end) + for sent_span in sents_char_spans] + start_token_ids = [span[0].idx for span in char_spans if span + is not None] + for token in doc: + token.is_sent_start = (True if token.idx + in start_token_ids else False) + return doc + + if __name__ == "__main__": SubstituteListPeriodRule = Rule('♨', '∯') StdRule = Rule(r'∯', r'∯♨') diff --git a/setup.py b/setup.py index dd3652d..bb234c9 100644 --- a/setup.py +++ b/setup.py @@ -88,4 +88,7 @@ def run(self): cmdclass={ 'upload': UploadCommand, }, + entry_points={ + "spacy_factories": ["pysbd = pysbd.utils:PySBDFactory"] + } )