annotate.go

package sentences

import (
	"strings"
)

// AnnotateTokens is the interface the sentence tokenizer uses to add
// properties to tokens during tokenization. An implementation receives a
// slice of tokens and returns a slice of tokens.
type AnnotateTokens interface {
	Annotate(tokens []*Token) []*Token
}

// TypeBasedAnnotation performs the first pass of annotation, which makes
// decisions based purely on the word type of each word:
//   - a token for which HasSentEndChars reports true is marked as a sentence
//     break.
//   - a token that ends in two or more periods is left unmarked by this pass.
//   - any other word ending in '.' that IsAbbr recognises is marked as an
//     abbreviation.
//   - any other word ending in '.' is marked as a sentence break.
//
// The results are recorded on the tokens themselves through their Abbr and
// SentBreak fields; no separate sets of indices are returned.
//
// NewAnnotations fills this struct with an unkeyed literal, so the order of
// the embedded fields must not change.
//
// NOTE(review): IsAbbr, HasSentEndChars and HasPeriodFinal are presumably
// promoted from the embedded fields below — confirm which one supplies each.
type TypeBasedAnnotation struct {
	*Storage
	PunctStrings
	TokenExistential
}

// NewTypeBasedAnnotation returns a TypeBasedAnnotation whose embedded
// Storage, PunctStrings and TokenExistential are set to s, p and e.
func NewTypeBasedAnnotation(s *Storage, p PunctStrings, e TokenExistential) *TypeBasedAnnotation {
	annotation := new(TypeBasedAnnotation)
	annotation.Storage = s
	annotation.PunctStrings = p
	annotation.TokenExistential = e

	return annotation
}

// NewAnnotations builds the default list of annotation passes used by the
// tokenizer: a TypeBasedAnnotation followed by a TokenBasedAnnotation. Both
// share s and p, and word serves as the token helper for each of them as well
// as for the OrthoContext handed to the token-based pass.
func NewAnnotations(s *Storage, p PunctStrings, word WordTokenizer) []AnnotateTokens {
	typeBased := &TypeBasedAnnotation{
		Storage:          s,
		PunctStrings:     p,
		TokenExistential: word,
	}

	ortho := &OrthoContext{s, p, word, word}
	tokenBased := &TokenBasedAnnotation{
		Storage:      s,
		PunctStrings: p,
		TokenParser:  word,
		TokenGrouper: &DefaultTokenGrouper{},
		Ortho:        ortho,
	}

	return []AnnotateTokens{typeBased, tokenBased}
}

// Annotate applies the type-based annotation to every token in order. The
// tokens are modified in place and the slice that was passed in is returned.
func (a *TypeBasedAnnotation) Annotate(tokens []*Token) []*Token {
	for i := range tokens {
		a.typeAnnotation(tokens[i])
	}

	return tokens
}

// typeAnnotation classifies a single token from its surface form alone.
//
// A token for which HasSentEndChars reports true is marked as a sentence
// break. Otherwise, a token with a final period that does not end in ".." is
// looked up as an abbreviation: its final character is dropped, the rest is
// lower-cased, and IsAbbr is asked about both the whole word and the segment
// after its last hyphen. A match sets Abbr; anything else sets SentBreak.
// Tokens ending in ".." are left untouched by this pass.
func (a *TypeBasedAnnotation) typeAnnotation(token *Token) {
	if a.HasSentEndChars(token) {
		token.SentBreak = true
		return
	}

	if !a.HasPeriodFinal(token) || strings.HasSuffix(token.Tok, "..") {
		return
	}

	// Drop the final character by rune, not by byte: slicing the string with
	// a rune count cuts tokens containing multi-byte characters in the wrong
	// place (possibly through the middle of a character), so the abbreviation
	// lookup would never match them.
	chars := []rune(token.Tok)
	if n := len(chars); n > 0 {
		chars = chars[:n-1]
	}
	tokNoPeriod := strings.ToLower(string(chars))

	// Segment after the last hyphen; the whole word when there is no hyphen
	// (LastIndex returns -1, so the slice starts at 0).
	tokLastHyphEl := tokNoPeriod[strings.LastIndex(tokNoPeriod, "-")+1:]

	if a.IsAbbr(tokNoPeriod, tokLastHyphEl) {
		token.Abbr = true
		return
	}

	token.SentBreak = true
}

// TokenBasedAnnotation performs a token-based classification (section 4) over
// the given tokens, making use of the orthographic heuristic (4.1.1), the
// collocation heuristic (4.1.2) and the frequent sentence starter heuristic
// (4.1.3).
//
// NOTE(review): the section numbers presumably refer to the Punkt paper by
// Kiss & Strunk — confirm.
//
// NewAnnotations fills this struct with an unkeyed literal, so the order of
// the embedded fields must not change.
type TokenBasedAnnotation struct {
	// NOTE(review): Collocations, SentStarters and OrthoContext are read
	// through the receiver in tokenAnnotation; presumably Storage supplies
	// them — confirm.
	*Storage
	PunctStrings
	// TokenParser provides the per-token checks tokenAnnotation calls
	// (HasPeriodFinal, TypeNoPeriod, TypeNoSentPeriod, IsInitial, IsEllipsis,
	// FirstUpper).
	TokenParser
	// TokenGrouper arranges the tokens into the pairs that Annotate walks.
	TokenGrouper
	// Ortho provides the Heuristic that tokenAnnotation consults for the token
	// following a period.
	Ortho
}

// Annotate asks the TokenGrouper to arrange the tokens into pairs and applies
// the token-based annotation to each pair in turn. The tokens are modified in
// place and the slice that was passed in is returned.
func (a *TokenBasedAnnotation) Annotate(tokens []*Token) []*Token {
	pairs := a.TokenGrouper.Group(tokens)
	for i := range pairs {
		current, next := pairs[i][0], pairs[i][1]
		a.tokenAnnotation(current, next)
	}

	return tokens
}

// tokenAnnotation revisits the classification of tokOne using the token that
// follows it. It does nothing when tokTwo is nil or when tokOne has no final
// period. Otherwise the checks below run in order and the first one that
// applies decides the outcome; tokOne (and, for a spaced ellipsis, tokTwo) is
// modified in place.
func (a *TokenBasedAnnotation) tokenAnnotation(tokOne, tokTwo *Token) {
	if tokTwo == nil || !a.TokenParser.HasPeriodFinal(tokOne) {
		return
	}

	curType := a.TokenParser.TypeNoPeriod(tokOne)
	followType := a.TokenParser.TypeNoSentPeriod(tokTwo)
	initial := a.TokenParser.IsInitial(tokOne)

	// markAbbr labels tokOne as an abbreviation that does not end a sentence.
	markAbbr := func() {
		tokOne.SentBreak = false
		tokOne.Abbr = true
	}

	// [4.1.2. Collocation Heuristic] If there's a collocation between the
	// word before and after the period, then label tokOne as an abbreviation
	// and NOT a sentence break. Note that collocations with frequent sentence
	// starters as their second word are excluded in training.
	key := strings.Join([]string{curType, followType}, ",")
	if a.Collocations[key] != 0 {
		markAbbr()
		return
	}

	// [4.2. Token-Based Reclassification of Abbreviations] If the token is an
	// abbreviation or an ellipsis (and not an initial), decide whether it
	// should *also* be classified as a sentence break.
	if abbrOrEllipsis := tokOne.Abbr || a.TokenParser.IsEllipsis(tokOne); abbrOrEllipsis && !initial {
		// [4.1.1. Orthographic Heuristic] Is there orthographic evidence
		// that the next word starts a sentence?
		//
		// [4.1.3. Frequent Sentence Starter Heuristic] Failing that, is the
		// next word capitalized and a member of the frequent sentence
		// starters list?
		if a.Ortho.Heuristic(tokTwo) == 1 ||
			(a.TokenParser.FirstUpper(tokTwo) && a.SentStarters[followType] != 0) {
			tokOne.SentBreak = true
			return
		}
	}

	// Two consecutive tokens that are each a lone "." are probably part of a
	// spaced ellipsis ". . .", so neither of them is a sentence break.
	if tokOne.Tok == "." && tokTwo.Tok == "." {
		tokOne.SentBreak, tokTwo.SentBreak = false, false
		return
	}

	// [4.3. Token-Based Detection of Initials and Ordinals] Only initials and
	// numbers can still be reclassified as abbreviations from here on.
	if !initial && curType != "##number##" {
		return
	}

	switch a.Ortho.Heuristic(tokTwo) {
	case 0:
		markAbbr()
	case -1:
		// Special heuristic for initials: if the orthographic heuristic is
		// unknown and the next word is always capitalized, mark tokOne as an
		// abbreviation (eg: J. Bach).
		if initial &&
			a.TokenParser.FirstUpper(tokTwo) &&
			a.OrthoContext[followType]&orthoLc == 0 {
			markAbbr()
		}
	}
}