Skip to content

Commit

Permalink
Merge pull request #136 from xosh/benchmark_lcp
Browse files Browse the repository at this point in the history
Benchmark lcp
  • Loading branch information
simongog committed Nov 27, 2013
2 parents f18fdaf + 2af8a33 commit 9a2ed71
Show file tree
Hide file tree
Showing 16 changed files with 378 additions and 0 deletions.
11 changes: 11 additions & 0 deletions benchmark/lcp/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
*
!.gitignore
!lcp.config
!README.md
!bin/
!src/
!visualize/
!compile_options.config
!Makefile
!results/
!test_case.config
67 changes: 67 additions & 0 deletions benchmark/lcp/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
include ../../Make.helper
# Compiler flags (MY_CXX_FLAGS comes from the included Make.helper) and
# the directory layout of this benchmark.
CFLAGS = $(MY_CXX_FLAGS)
SRC_DIR = src
BIN_DIR = bin
LIBS = -lsdsl -ldivsufsort -ldivsufsort64

# Values extracted from the configuration files via the config_ids helper
# of Make.helper (presumably the first field of every non-comment line).
C_OPTIONS:=$(call config_ids,compile_options.config)
TC_IDS:=$(call config_ids,test_case.config)
LCP_IDS:=$(call config_ids,lcp.config)

# Field 2 of every test case entry, i.e. the path of the test file.
DL = $(foreach tc,$(TC_IDS),$(call config_select,test_case.config,$(tc),2))

# One executable per LCP algorithm and one result file per test case.
LCP_EXECS = $(addprefix $(BIN_DIR)/build_,$(LCP_IDS))
RES_FILES = $(addprefix results/,$(TC_IDS))

# Concatenation of all per-test-case result files.
RESULT_FILE=results/all.txt

# Build the preprocessing tool and one binary per LCP algorithm.
execs: $(BIN_DIR)/prep_sa_bwt $(LCP_EXECS)

# Run all test cases, merge the per-test-case result files into
# $(RESULT_FILE) and generate the report in visualize/.
# $(MAKE) (instead of a literal `make`) propagates -j/-n and command line
# variables; `&&` stops if the directory change fails.
timing: execs $(RES_FILES)
	@cat $(RES_FILES) > $(RESULT_FILE)
	@cd visualize && $(MAKE)

# Neither target names a file that is created.
.PHONY: execs timing

# Compile the preprocessing tool that prepares text, SA and BWT for a test
# case. The output is written to $@ so that the rule stays correct when
# BIN_DIR is changed.
$(BIN_DIR)/prep_sa_bwt: $(SRC_DIR)/create_sa_bwt.cpp
	@echo "Compiling prep_sa_bwt"
	@$(MY_CXX) $(CFLAGS) $(C_OPTIONS) -L${SDSLLITE}/lib \
		$< -I${SDSLLITE}/include -o $@ $(LIBS)

# precalc<TC_ID>: start a fresh result file results/<TC_ID> with the test
# case meta data and append the output of prep_sa_bwt for the test file.
# The test case ID is derived from the pattern stem with the `dim` helper
# of Make.helper (presumably it selects the first dimension of the stem).
# $(DL) makes sure the test files are available before the size is taken.
precalc%: test_case.config $(DL) lcp.config
	$(eval TC_ID:=$(call dim,1,$*))
	$(eval TC_TEX_NAME:=$(call config_select,test_case.config,$(TC_ID),3))
	$(eval TC_PATH:=$(call config_select,test_case.config,$(TC_ID),2))
	$(eval TC_SIZE:=$(shell wc -c <$(TC_PATH)))
	@echo "Running test case: $(TC_ID)"
	@echo "# TC_ID = $(TC_ID)" > results/$(TC_ID)
	@echo "# TC_TEX_NAME = $(TC_TEX_NAME)">> results/$(TC_ID)
	@echo "# TC_SIZE = $(TC_SIZE)">> results/$(TC_ID)
	@$(BIN_DIR)/prep_sa_bwt $(TC_PATH) >> results/$(TC_ID)

# results/<TC_ID>: run every LCP executable on the files prepared by
# precalc<TC_ID> and append its output to the result file.
# The executables are run by the recipe shell (not by $(shell ...) during
# expansion), so a failing benchmark stops the build instead of silently
# producing an incomplete result file. The temporary LCP/ISA files are
# removed after each run; all remaining cache files are removed at the end
# (-f: do not fail if none are left).
results/%: precalc%
	@for lcp_exec in $(LCP_EXECS); do \
		$$lcp_exec >> $@ || exit 1; \
		rm -f lcp_tmp.sdsl isa_tmp.sdsl; \
	done
	@rm -f *.sdsl

# bin/build_<LCP_ID>: compile create_lcp.cpp for one algorithm of lcp.config.
# Field 2 of the config entry is passed to the source as LCP_TYPE, the ID
# itself as LCPID.
$(BIN_DIR)/build_%: $(SRC_DIR)/create_lcp.cpp lcp.config
	$(eval LCP_ID:=$(call dim,1,$*))
	$(eval LCP_TYPE:=$(call config_select,lcp.config,$(LCP_ID),2))
	@echo "Compiling build_$*"
	@$(MY_CXX) $(CFLAGS) $(C_OPTIONS) \
		-DLCP_TYPE="$(LCP_TYPE)" -DLCPID="$(LCP_ID)" \
		-L$(SDSLLITE)/lib \
		$< -I$(SDSLLITE)/include -o $@ $(LIBS)


include ../Make.download

# Remove the compiled benchmark executables.
clean-build:
	@echo "Remove executables"
	rm -f $(BIN_DIR)/build*
	rm -f $(BIN_DIR)/prep*

# Remove all result files.
clean-result:
	@echo "Remove results"
	rm -f results/*

# Remove executables and results.
cleanall: clean-build clean-result

# The clean targets are commands, not files; without this declaration a
# file with one of these names would disable the target.
.PHONY: clean-build clean-result cleanall
49 changes: 49 additions & 0 deletions benchmark/lcp/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
# Benchmarking LCP algorithms

## Methodology

Explored dimensions:

* lcp algorithms
* test cases

## Directory structure

* [bin](./bin): Contains the executables of the project.
* [results](./results): Contains the results of the experiments.
* [src](./src): Contains the source code of the benchmark.
* [visualize](./visualize): Contains an `R` script which generates
a report in LaTeX format.

## Prerequisites

* For the visualization you need the following software:
- [R][RPJ] with package `xtable`. You can install the
package by calling `install.packages("xtable")` in R.
- [pdflatex][LT] to generate the pdf reports.

## Usage

* `make timing` compiles the programs, downloads
the test instances, builds the LCP arrays and generates a report located at
`visualize/lcp.pdf`. The raw numbers of the timings
can be found in `results/all.txt`.
* All created binaries and test results can be deleted
by calling `make cleanall`.

## Customization of the benchmark

The project contains several configuration files:

* [lcp.config][LCPCONFIG]: Specify different LCP algorithms.
* [test_case.config][TCCONF]: Specify test instances by ID, path, LaTeX-name
for the report, and download URL.
* [compile_options.config][CCONF]: Specify compile options by option string.

Note that the benchmark will execute every combination of lcp algorithms and test cases.

[RPJ]: http://www.r-project.org/ "R"
[LT]: http://www.tug.org/applications/pdftex/ "pdflatex"
[LCPCONFIG]: ./lcp.config "lcp.config"
[TCCONF]: ./test_case.config "test_case.config"
[CCONF]: ./compile_options.config "compile_options.config"
2 changes: 2 additions & 0 deletions benchmark/lcp/bin/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
*
!.gitignore
2 changes: 2 additions & 0 deletions benchmark/lcp/compile_options.config
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# Compile options
-O3 -funroll-loops -fomit-frame-pointer -ffast-math -DNDEBUG
14 changes: 14 additions & 0 deletions benchmark/lcp/lcp.config
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# This file specifies the LCP construction algorithms that are used in the benchmark.
#
# Each LCP algorithm is specified by a 4-tuple: LCP_ID;LCP_ALGORITHM;LCP_LATEX_NAME;BWT_NEEDED
# * LCP_ID : An identifier for the algorithm. Only letters and underscores are allowed in ID.
# * LCP_ALGORITHM : Corresponding lcp algorithm.
# * LCP_LATEX_NAME: LaTeX name for output in the benchmark report.
# * BWT_NEEDED : T(rue) if lcp algorithm needs bwt as input, otherwise F(alse).
kasai;construct_lcp_kasai<8>;lcp-kasai;F
phi_algorithm;construct_lcp_PHI<8>;lcp-$\Phi$;F
semi_extern_phi;construct_lcp_semi_extern_PHI;lcp-semi-extern-$\Phi$;F
go;construct_lcp_go;lcp-go;T
goPhi;construct_lcp_goPHI;lcp-go-$\Phi$;T
bwtb;construct_lcp_bwt_based;lcp-bwt-based;T
bwtb2;construct_lcp_bwt_based2;lcp-bwt-based2;T
2 changes: 2 additions & 0 deletions benchmark/lcp/results/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
*
!.gitignore
4 changes: 4 additions & 0 deletions benchmark/lcp/src/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
*
!.gitignore
!create_lcp.cpp
!create_sa_bwt.cpp
34 changes: 34 additions & 0 deletions benchmark/lcp/src/create_lcp.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
#include <sdsl/sdsl_concepts.hpp>
#include <sdsl/int_vector.hpp>
#include <sdsl/construct.hpp>
#include <sdsl/construct_lcp.hpp>
#include <string>
#include <chrono>

using namespace sdsl;
using namespace std;
using namespace std::chrono;

#define S(x) #x
#define SX(x) S(x)

int main(int argc, char** argv)
{
memory_monitor::start();
string dir = ".";
string id = "tmp";
cache_config config(false, dir, id);

register_cache_file(conf::KEY_TEXT, config);
register_cache_file(conf::KEY_SA, config);
register_cache_file(conf::KEY_BWT, config);

auto start = high_resolution_clock::now();
LCP_TYPE(config);
auto stop = high_resolution_clock::now();
memory_monitor::stop();
cout << "# " SX(LCPID) "_TIME = " << duration_cast<milliseconds>(stop-start).count()/(double)1000 << endl;
cout << "# " SX(LCPID) "_MMPEAK = "<< memory_monitor::peak() << endl;

return 0;
}
68 changes: 68 additions & 0 deletions benchmark/lcp/src/create_sa_bwt.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
#include <sdsl/sdsl_concepts.hpp>
#include <sdsl/int_vector.hpp>
#include <sdsl/construct.hpp>
#include <sdsl/construct_sa.hpp>
#include <sdsl/construct_bwt.hpp>
#include <string>
#include <chrono>
#include <iostream>

using namespace sdsl;
using namespace std;
using namespace std::chrono;

typedef bit_vector::size_type size_type;

//argv[1] = test file
int main(int argc, char** argv)
{
memory_monitor::start();
string file = argv[1];
uint8_t num_bytes = 1; // Byte Alphabet
string dir = ".";
string id = "tmp";
cache_config config(false, dir, id);

//load text
auto start = high_resolution_clock::now();
{
int_vector<8> text;
load_vector_from_file(text, file, num_bytes);
if (contains_no_zero_symbol(text, file)) {
append_zero_symbol(text);
store_to_cache(text, conf::KEY_TEXT, config);
}
register_cache_file(conf::KEY_TEXT, config);
}
auto stop = high_resolution_clock::now();
memory_monitor::stop();
cout << "# TXT_TIME = " << duration_cast<milliseconds>(stop-start).count()/(double)1000 << endl;
cout << "# TXT_MMPEAK = " << memory_monitor::peak() << endl;

//construct sa
memory_monitor::start();
start = high_resolution_clock::now();
{
construct_sa<8>(config);
register_cache_file(conf::KEY_SA, config);
}
stop = high_resolution_clock::now();
memory_monitor::stop();
cout << "# SA_TIME = " << duration_cast<milliseconds>(stop-start).count()/(double)1000 << endl;
cout << "# SA_MMPEAK = " << memory_monitor::peak() << endl;

//construct bwt
memory_monitor::start();
start = high_resolution_clock::now();
{
construct_bwt<8>(config);
register_cache_file(conf::KEY_BWT, config);
}
stop = high_resolution_clock::now();
memory_monitor::stop();
cout << "# BWT_TIME = " << duration_cast<milliseconds>(stop-start).count()/(double)1000 <<endl;
cout << "# BWT_MMPEAK = "<< memory_monitor::peak() << endl;

return 0;
}

16 changes: 16 additions & 0 deletions benchmark/lcp/test_case.config
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# Configuration for test files
# (1) Identifier for test file (consisting of letters, no `.`)
# (2) Path to the test file
# (3) LaTeX name
# (4) Download link (if the test is available online)
ENGLISH;../data/english.200MB;english.200MB;http://pizzachili.di.unipi.it/texts/nlang/english.200MB.gz
DBLPXML;../data/dblp.xml.200MB;dblp.xml.200MB;http://pizzachili.di.unipi.it/texts/xml/dblp.xml.200MB.gz
DNA;../data/dna.200MB;dna.200MB;http://pizzachili.di.unipi.it/texts/dna/dna.200MB.gz
PROTEINS;../data/proteins.200MB;proteins.200MB;http://pizzachili.di.unipi.it/texts/protein/proteins.200MB.gz
SOURCES;../data/sources.200MB;sources.200MB;http://pizzachili.di.unipi.it/texts/code/sources.200MB.gz
INFLUENZA;../data/influenza;influenza;http://pizzachili.dcc.uchile.cl/repcorpus/real/influenza.gz
EINSTEIN-de;../data/einstein.de.txt;einstein-de;http://pizzachili.dcc.uchile.cl/repcorpus/real/einstein.de.txt.gz
EINSTEIN-en;../data/einstein.en.txt;einstein-en;http://pizzachili.dcc.uchile.cl/repcorpus/real/einstein.en.txt.gz
PARA;../data/para;para;http://pizzachili.dcc.uchile.cl/repcorpus/real/para.gz
WORLDLEADER;../data/world_leaders;world-leaders;http://pizzachili.dcc.uchile.cl/repcorpus/real/world_leaders.gz
E-COLI;../data/Escherichia_Coli;E.coli;http://pizzachili.dcc.uchile.cl/repcorpus/real/Escherichia_Coli.gz
6 changes: 6 additions & 0 deletions benchmark/lcp/visualize/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
*
!.gitignore
!Makefile
!lcp-header.tex
!lcp-footer.tex
!lcp.R
17 changes: 17 additions & 0 deletions benchmark/lcp/visualize/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
include ../../../Make.helper

CONFIG_FILES= ../test_case.config

# Default target: build the PDF report.
all: lcp.pdf
.PHONY: all

# Typeset the report. The pdflatex output is appended to LaTeX.log, the
# file name that the clean target removes.
lcp.pdf: lcp.tex
	@echo "Use pdflatex to generate lcp.pdf"
	@pdflatex lcp.tex >> LaTeX.log 2>&1

# Generate the LaTeX source of the report with R.
# Besides the results and the files in $(CONFIG_FILES), lcp.R also reads
# ../lcp.config and the LaTeX header/footer, so a change to any of them
# must regenerate lcp.tex.
lcp.tex: ../results/all.txt ../../basic_functions.R lcp.R $(CONFIG_FILES) \
		../lcp.config lcp-header.tex lcp-footer.tex
	@echo "Use R to generate lcp.tex"
	@R --vanilla < lcp.R > R.log 2>&1

# Remove all generated report files and logs. LaTeX.Log is listed as well
# because the lcp.pdf rule has written its log under that spelling.
.PHONY: clean
clean:
	rm -f lcp.pdf lcp.aux lcp.tex fig* \
		lcp.log R.log LaTeX.log LaTeX.Log
1 change: 1 addition & 0 deletions benchmark/lcp/visualize/lcp-footer.tex
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
\end{document}
8 changes: 8 additions & 0 deletions benchmark/lcp/visualize/lcp-header.tex
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
\documentclass[9pt,a4paper,DIV10]{scrartcl}
\usepackage{booktabs}
\usepackage{array}
\usepackage{ragged2e}

\begin{document}

\pagestyle{empty}
77 changes: 77 additions & 0 deletions benchmark/lcp/visualize/lcp.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
library(xtable)
source("../../basic_functions.R")

tex_file = "lcp.tex"

tc_config <- readConfig("../test_case.config",c("TC_ID","PATH","LATEX_NAME","URL"))
lcp_config <- readConfig("../lcp.config",c("LCP_ID","LCP_TYPE","LATEX_NAME","BWT"))


# Build the header row of a result table.
#
# names: character vector with one caption per column group.
#
# Returns a single string consisting of \toprule, one two-column
# \multicolumn cell per caption, and one \cmidrule per caption that spans
# the table columns 3*i to 3*i+1 of the i-th group. Underscores are
# escaped for LaTeX.
make_latex_header <- function(names){
  x <- paste("&&\\multicolumn{2}{c}{", names,"}")
  x <- paste(x, collapse=" ")
  clines=""
  # seq_along() yields an empty sequence for empty input, whereas
  # 1:length(names) would iterate over c(1,0) and emit bogus rules.
  for(i in seq_along(names)){
    clines <- paste(clines,"\\cmidrule{",3*i,"-",3*i+1,"}",sep="")
  }
  y <- paste("\\toprule",x, "\\\\",clines,"\n")
  gsub("_","\\\\_",y)
}

# Redirect all following output to the report file and start it with the
# content of the LaTeX header file.
sink(tex_file)
cat(paste(readLines("lcp-header.tex"),collapse="\n"))

# Benchmark results; the loop below creates one table per row of this
# data frame.
maindata <- data_frame_from_key_value_pairs( "../results/all.txt" )

# Captions of the column groups of each table. Every group consists of a
# time and a space column.
names<-c("SA","BWT","LCP","OVERALL")
# Two extra header rows: the column labels (Time/Space) and their units.
unitrow <- paste(c("", rep(c("&&Time", "&Space"), length(names)), "\\\\","", rep(c("&&(sec)", "&(\\%)"), length(names)), "\\\\[1ex]"), collapse="", sep='')

# create a table for each test case
# One iteration per row of maindata: build a table with one row per entry
# of lcp_config and print it as LaTeX.
for(i in 1:nrow(maindata)){

data<-maindata[i,]
row<-nrow(lcp_config)
# Value of the TC_SIZE key; all space columns are expressed relative to it.
size<-data[["TC_SIZE"]]
# All cells are strings; the EMPTY* columns are left blank and separate the
# four (time, space) column groups.
table<-data.frame(EMPTY=character(row),SATIME=character(row),SASPACE=character(row),EMPTY2=character(row),BWTTIME=character(row),BWTSPACE=character(row),EMPTY3=character(row),LCPTIME=character(row),LCPSPACE=character(row),EMPTY4=character(row),OVERALLTIME=character(row),OVERALLSPACE=character(row),stringsAsFactors=FALSE)

# Fill one table row per LCP algorithm. Times are printed with two
# decimals; space is the peak memory as a percentage of size.
for(l in 1:row){
# The SA values are the same in every row of the table.
table[l,]["SATIME"]<-sprintf("%.2f",data[["SA_TIME"]])
table[l,]["SASPACE"]<-round(data[["SA_MMPEAK"]]*100/size, digits=0)


# BWT column of lcp.config: does the algorithm need the BWT as input?
# NOTE(review): relies on readConfig delivering logical values here - confirm.
if(lcp_config[["BWT"]][l]){
# BWT needed: the overall time adds the SA, BWT and LCP times, and the
# overall space is the maximum of the three peaks.
table[l,]["BWTTIME"]<-sprintf("%.2f",data[["BWT_TIME"]])
table[l,]["BWTSPACE"]<-round(data[["BWT_MMPEAK"]]*100/size, digits=0)
table[l,]["OVERALLTIME"]<-sprintf("%.2f",data[["SA_TIME"]]+data[["BWT_TIME"]]+data[[paste(lcp_config[["LCP_ID"]][l],"_TIME",sep="")]])
table[l,]["OVERALLSPACE"]<-round(max(data[["SA_MMPEAK"]],data[["BWT_MMPEAK"]],data[[paste(lcp_config[["LCP_ID"]][l],"_MMPEAK",sep="")]])*100/size, digits=0)
}
else{
# BWT not needed: its columns show "-" and it is left out of the overall
# time and space.
table[l,]["BWTTIME"]<-"-"
table[l,]["BWTSPACE"]<-"-"
table[l,]["OVERALLTIME"]<-sprintf("%.2f",data[["SA_TIME"]]+data[[paste(lcp_config[["LCP_ID"]][l],"_TIME",sep="")]])
table[l,]["OVERALLSPACE"]<-round(max(data[["SA_MMPEAK"]],data[[paste(lcp_config[["LCP_ID"]][l],"_MMPEAK",sep="")]])*100/size, digits=0)
}

# Values of the LCP construction itself, read from the keys
# <LCP_ID>_TIME and <LCP_ID>_MMPEAK.
table[l,]["LCPTIME"]<-sprintf("%.2f",data[[paste(lcp_config[["LCP_ID"]][l],"_TIME",sep="")]])
table[l,]["LCPSPACE"]<-round(data[[paste(lcp_config[["LCP_ID"]][l],"_MMPEAK",sep="")]]*100/size, digits=0)
}

# The rows are labelled with the LaTeX names of the algorithms.
row.names(table)<-lcp_config[["LATEX_NAME"]]

# Convert and print the table: alignment and digits for the row name
# column followed by one (spacer, time, space) triple per column group.
ali <- c("l", rep(c("@{\\hspace{1ex}}l","c","c"), (ncol(table))/3) )
dig <- c(0, rep(c(0,3,0),(ncol(table))/3 ))

# The header rows are inserted before the first table row and \bottomrule
# after the last one; the default horizontal lines and column names are
# switched off, and the row names are printed unsanitized.
print( xtable(table, align=ali, digits=dig,
caption = paste("Results for ",as.character(data[["TC_TEX_NAME"]])," (size: ",round(size/(1024^2), digits=3),"MB). Runtime in seconds. Space is the peak memory usage (including input and output) as fraction of original file size.")),
add.to.row=list(pos=list(-1,0,nrow(table)), command=c(make_latex_header(names),unitrow,"\\bottomrule")),
hline.after=c(),
sanitize.rownames.function = identity,
include.colnames = FALSE
)
}

cat(paste(readLines("lcp-footer.tex"),collapse="\n"))
sink(NULL)

0 comments on commit 9a2ed71

Please sign in to comment.