Skip to content

Commit

Permalink
Add date filter to background linking reranker (#786)
Browse files Browse the repository at this point in the history
  • Loading branch information
chriskamphuis authored and lintool committed Aug 29, 2019
1 parent f690b5b commit c5ee9af
Show file tree
Hide file tree
Showing 3 changed files with 30 additions and 9 deletions.
3 changes: 2 additions & 1 deletion src/main/java/io/anserini/index/generator/WapoGenerator.java
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ public class WapoGenerator extends LuceneDocumentGenerator<WashingtonPostCollect
public static final String FIELD_RAW = "raw";
public static final String FIELD_BODY = "contents";
public static final String FIELD_ID = "id";

private static final String PATTERN = "<.+>";
public static final List<String> CONTENT_TYPE_TAG = Arrays.asList("sanitized_html", "tweet");

Expand Down Expand Up @@ -86,6 +86,7 @@ public Document createDocument(WashingtonPostCollection.Document wapoDoc) {
// This is needed to break score ties by docid.
doc.add(new SortedDocValuesField(FIELD_ID, new BytesRef(id)));
doc.add(new LongPoint(WapoField.PUBLISHED_DATE.name, wapoDoc.getPublishDate()));
doc.add(new StoredField(WapoField.PUBLISHED_DATE.name, wapoDoc.getPublishDate()));
wapoDoc.getAuthor().ifPresent(author -> {
doc.add(new StringField(WapoField.AUTHOR.name, author, Field.Store.NO));
});
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@

import static io.anserini.index.generator.LuceneDocumentGenerator.FIELD_BODY;
import static io.anserini.index.generator.LuceneDocumentGenerator.FIELD_ID;
import static io.anserini.index.generator.WapoGenerator.WapoField.PUBLISHED_DATE;

/*
* TREC News Track Background Linking task postprocessing.
Expand All @@ -54,29 +55,44 @@ public ScoredDocuments rerank(ScoredDocuments docs, RerankerContext context) {
}

// Remove duplicates: (1) results that are near-identical to the query document, and (2) results that are near-identical to an earlier document in the result list.
Set<Integer> duplicates = new HashSet<>();
Set<Integer> toRemove = new HashSet<>();
for (int i = 0; i < docs.documents.length; i++) {
if (duplicates.contains(i)) continue;
String docid = docs.documents[i].getField(FIELD_ID).stringValue();
if (toRemove.contains(i)) continue;
if (computeCosineSimilarity(queryTermsMap, docsVectorsMap.get(i)) >= 0.9) {
duplicates.add(i);
toRemove.add(i);
continue;
}
for (int j = i+1; j < docs.documents.length; j++) {
if (computeCosineSimilarity(docsVectorsMap.get(i), docsVectorsMap.get(j)) >= 0.9) {
duplicates.add(j);
toRemove.add(j);
}
}
}

if (context.getSearchArgs().backgroundlinking_datefilter) {
try {
int luceneId = NewsBackgroundLinkingTopicReader.convertDocidToLuceneDocid(reader, queryDocId);
Document queryDoc = reader.document(luceneId);
long queryDocDate = Long.parseLong(queryDoc.getField(PUBLISHED_DATE.name).stringValue());
for (int i = 0; i < docs.documents.length; i++) {
long date = Long.parseLong(docs.documents[i].getField(PUBLISHED_DATE.name).stringValue());
if (date > queryDocDate) {
toRemove.add(i);
}
}
} catch (Exception e) {
e.printStackTrace();
}
}

ScoredDocuments scoredDocs = new ScoredDocuments();
int resSize = docs.documents.length - duplicates.size();
int resSize = docs.documents.length - toRemove.size();
scoredDocs.documents = new Document[resSize];
scoredDocs.ids = new int[resSize];
scoredDocs.scores = new float[resSize];
int idx = 0;
for (int i = 0; i < docs.documents.length; i++) {
if (!duplicates.contains(i)) {
if (!toRemove.contains(i)) {
scoredDocs.documents[idx] = docs.documents[i];
scoredDocs.scores[idx] = docs.scores[i];
scoredDocs.ids[idx] = docs.ids[i];
Expand Down
6 changes: 5 additions & 1 deletion src/main/java/io/anserini/search/SearchArgs.java
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,11 @@ public class SearchArgs {
@Option(name = "-backgroundlinking.weighted", usage = "Boolean switch to construct boosted query for TREC News Track Background " +
"Linking task. The terms scores are their tf-idf score from the query document")
public boolean backgroundlinking_weighted = false;


@Option(name = "-backgroundlinking.datefilter", usage = "Boolean switch to filter out articles published after topic article " +
"for the TREC News Track Background Linking task.")
public boolean backgroundlinking_datefilter = false;

@Option(name = "-stemmer", usage = "Stemmer: one of the following porter,krovetz,none. Default porter")
public String stemmer = "porter";

Expand Down

0 comments on commit c5ee9af

Please sign in to comment.