#
# Global properties file for the KBP project
#
kbp.runid = mimlre_small_newAlgo
serializedRelationExtractorPath = kbp_relation_model_mimlre_small
work.dir = small_dataset/corpora
trainer.model = atleastonce
#
# BaselineNLProcessor settings
#
# annotators is used for test queries in KBPTester and SnippetsToCache. Do NOT mess with this property!
annotators = tokenize, ssplit, pos, lemma, ner, regexner, parse, dcoref
ssplit.htmlBoundariesToDiscard = p,text,post,postdate,poster,turn,speaker,quote
parser.maxlen = 50
regexner.mapping = resources/kbp/regexner/combined_map
regexner.ignorecase = true
oldCorefFormat = true
#
# KBPReader properties
#
# new indices over NFS, using our own customized annotation caching
index.kbp = small_dataset/index_dir
index.maxsentencelength = 50
# could be NONE, NER, TRIGGER, or BUCKETS (see IndexExtractor.java for more details)
index.train.sortmode = NONE
index.train.useknownslots = true
index.test.sortmode = BUCKETS
index.train.sentences.per.entity = 50
index.test.sentences.per.entity = 50
reader.useweb = false
index.train.usecache = true
index.test.usecache = true
index.cache.dir = small_dataset/index_cache
# number of sentences before and after the current to return as part of a match
index.context.previous = 0
index.context.next = 0
# How many extra results to consider when picking sentences.per.entity
index.extraresults.factor = 1.0
# same thing, but applied at the IndexAndWebCacheSentenceExtractor,
# which uses different sorting methods to pick the best ones
index.indexandweb.extraresults.factor = 1.0
# sort modes after combining all sentences... either sort index & web
# together or sort them separately
# see IndexAndWebCacheSentenceExtractor
index.combinemode = NO_SORTING
# we might use these trigger words during feature generation (disabled by default)
relation.triggers = resources/kbp/web_queries/keywords_no_ml
reader.multimatch = true
reader.enforcene = true
reader.domain.adapt = false
# possible values: all, three, two
# - all: makes one domain for each index
# - three: corpus, web, wiki
# - two: web, non-web
reader.domain.adapt.style = all
#
# block below relevant only for PipelineIndexExtractor, which should no longer be used (everything should be cached offline)
#
index.pipelinemethod = SPLIT
index.fullannotators = tokenize, ssplit, pos, lemma, ner, regexner, parse, dcoref
index.step1annotators = tokenize, ssplit
index.step2annotators = pos, lemma, ner, regexner, parse
overlapping.relations = resources/kbp/overlaps.tab
#
# KBPDomReader settings
#
kbp.mapping = small_dataset/resources/mapping
kbp.ner.types = small_dataset/resources/NER_types
kbp.manual.lists = small_dataset/resources/specific_relations
kbp.countries = small_dataset/resources/countries
kbp.states = small_dataset/resources/statesandprovinces
kbp.inputkb = small_dataset/input_kb/data
kbp.debugkb = small_dataset/
#
# Gazetteer info
#
nationalities = small_dataset/resources/CountryLexicalResource.db
states = small_dataset/resources/state-abbreviations.txt
#
# SemgrexExtractor
#
rule.dir = /NOT_RELEASED/u/nlp/data/TAC-KBP2010/patterns
priority.file = /NOT_RELEASED/u/nlp/data/TAC-KBP2010/relation.priorities
use.statistical.model = true
use.rulebased.model = false
#
# ErrorAnalysis
#
## don't need NLP analysis for the non-cached indices
## do NOT set this in regular train/test runs!
#index.minimal.analysis = true
# if true, run the analysis over devQueries; otherwise, run it over the training KB set in analysis.kb
analysis.test.mode = false
# small.xml contains ~100 entities; big.xml contains ~1000 entities
analysis.kb = /NOT_RELEASED/u/nlp/data/TAC-KBP2010/TAC_2009_KBP_Evaluation_Reference_Knowledge_Base/stanford_splits/analysis/big.xml
# use this property if you want a serialized map from relation name to coremap
#analysis.dumpSentences = /u/horatio/kbpsentences2.ser
#
# Model properties
#
# do not tune the acceptance threshold for a slot; accept everything
slot.threshold = 0
# keep only 5% of the negative examples; this is the best for KBP
negatives.sampleratio = 0.05
# use all negative labels; this is better when we aggressively subsample negatives
use.allnegs = true
# remove features seen less than 5 times; this makes everything much faster
featureCountThreshold = 5
# JointBayes settings (tuned on dev)
folds = 3
epochs = 8
features = 0
filter = all
inference.type = stable
# AtLeastOnce settings (tuned on dev)
perceptron.epochs = 2
inference.epochs = 1
algo.type = 1
#
# KBPTrainer/Tester settings
#
#nlpsub = true
relationFeatures = arg_words,arg_type,arg_order,full_tree_path,surface_distance_binary,surface_distance_bins,adjacent_words,entities_between_args,entity_counts_binary,entity_counts_bins,span_words_unigrams,dependency_path_lowlevel,dependency_path_words
# valid values: best, all
kbp.list.output = all
logLevel = SEVERE
readerLogLevel = SEVERE
#trainPath = resources/kbp/TAC_2009_KBP_Evaluation_Reference_Knowledge_Base/data
#testPath = /NOT_RELEASED/u/nlp/data/TAC-KBP2010/TAC_2009_KBP_Evaluation_Reference_Knowledge_Base/stanford_splits/devel
# test queries from the combined set of 2010 and 2011 queries
devQueries = resources/kbp/test_combined/TAC_KBP_Regular-Slot_Queries_DEVELOPMENT.xml
testQueries = resources/kbp/test_combined/TAC_KBP_Regular-Slot_Queries_TESTING.xml
kbp.goldresponses = resources/kbp/test_combined/TAC_KBP_Regular-Slot_Assessments
kbScoreFile = kb_score.txt
queryScoreFile = query_score
inference.during.tuning = false
doc.finding.during.tuning = true
# model combination properties
model.combination.enabled = false