-
Notifications
You must be signed in to change notification settings - Fork 0
/
Experiment_II.R
95 lines (77 loc) · 2.72 KB
/
Experiment_II.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
##################################################################
# EXPERIMENT II
# functions where they don't belong
# -> villainy with return, acquisition of magical agent with wedding, liquidation of lack with lack
# -> randomized samples vs. 'held out' data from markup
#
# what results will look like:
#
# VillainySample1 VillainySample2 VillainySample3 etc.
# VillainyLM .8
# LackLM .3
# ...
# WeddingLM 0
# EpilogueLM 0
#
# ... do the same with held out LackSamples ... WeddingSamples, EpilogueSamples
#
# PURPOSE: show how a language model from PftML can be useful
# even if it is imperfect, rough-hewn
#
##################################################################
library(sqldf)
library(data.table)
# Score every PftML object against every other one (all ordered pairs,
# including each object against itself).
# NOTE(review): getScore() and the dtm.* objects are not defined in this
# file; they are assumed to exist in the calling environment -- confirm.
#
# A list keeps each object intact; c() may flatten or merge the objects
# depending on their class.
pftmlObjects <- list(
  dtm.villainy,
  dtm.lack,
  dtm.cardinal
  # TODO: add the remaining PftML objects here (the original had a bare
  # `etc.` placeholder, which fails at runtime with "object not found")
)

# seq_along() iterates over the positions of the list; `1:pftmlObjects`
# would use the objects themselves as the upper bound of the sequence.
for (counter in seq_along(pftmlObjects)) {
  for (index in seq_along(pftmlObjects)) {
    # `[[` extracts the object itself; `[` would pass a one-element list.
    score <- getScore(pftmlObjects[[index]], pftmlObjects[[counter]])
    # TODO: insert score in sqlite3 db using sqldf
    # TODO: print out results
  }
}
# useful stuff
# system('cp * Lack Experiment')
# sqldf
# data.table
# random
# set.seed
# sample sqldf:
# (The heading above was bare text in the original, which is a parse error
# that prevents the whole script from being sourced.)
# NOTE(review): t12ts, tx, tdt, tbill_x and tbill_dt are not defined in this
# file; they are assumed to exist in the calling environment -- confirm.
library(sqldf)
library(data.table)

# dput() writes a text representation of t12ts to the console and returns
# its argument invisibly, so t_twelve ends up holding the same object.
t_twelve <- dput(t12ts)
#t12 <- as.data.table(t12ts)

# Column-bind the two inputs and convert the result to a data.table; the
# bare name on the next line prints it when run interactively.
t12 <- as.data.table(cbind(tx, tdt))
t12

tbill_data <- as.data.table(cbind(tbill_x, tbill_dt))
tbill_data

# do not run this - this is a cartesian join - it will hang your machine
#sqldf("
# SELECT *
# FROM t12 as t12, tbill_data as b, t12 as t12_plus1
# ")

# Print the current date and the day before the next Fed statement.
# The self-join pairs each row (t12_1) with the row (t12_2) whose tdt is the
# smallest tdt strictly greater than its own, found by the correlated
# subquery; the select list then reports that next tdt minus 1.
sqldf("SELECT t12_1.tdt, t12_2.tdt-1
FROM t12 as t12_1,
t12 as t12_2
WHERE t12_2.tdt > t12_1.tdt
AND t12_2.tdt = (SELECT MIN(t12_3.tdt) AS tdt
FROM t12 as t12_3
WHERE t12_3.tdt > t12_1.tdt)")
# Dot product calculation example (LSA-based similarity between texts).
library("lsa")

# Build the training term matrix from the texts in the Lack directory.
training_matrix <- textmatrix(
  "/home/kingfish/proppian_function_language_models/Lack"
)

# tf-idf style weighting: local binary term frequency times global idf.
training_matrix <- lw_bintf(training_matrix) * gw_idf(training_matrix)

# Create the LSA space from the weighted training matrix.
lsa_space <- lsa(training_matrix)

# Read the test and gold standard snippets from the Villainy directory,
# restricted to the vocabulary (row names) of the training matrix.
test_gold_matrix <- textmatrix(
  "/home/kingfish/proppian_function_language_models/Villainy",
  vocabulary = rownames(training_matrix)
)

# Same weighting as above.
# NOTE(review): the idf here is computed from the test matrix itself, not
# from the training matrix -- confirm this is intended.
test_gold_matrix <- lw_bintf(test_gold_matrix) * gw_idf(test_gold_matrix)

# Replace NA cells with zeroes before folding in.
test_gold_matrix[is.na(test_gold_matrix)] <- 0

# Fold the weighted test matrix into the existing LSA space.
test_gold_matrix_space <- fold_in(test_gold_matrix, lsa_space)

# Comparison matrix of all texts: correlation of every column of the
# folded-in matrix with every other column.
cor(test_gold_matrix_space, test_gold_matrix_space)

# Mean correlation of all folded-in columns against the "Wedding2.txt" column.
# NOTE(review): the matrix was read from the Villainy directory; presumably
# columns are named after the files there -- confirm that a Wedding2.txt
# column exists, otherwise this subscript fails.
mean(
  cor(test_gold_matrix_space, test_gold_matrix_space[, "Wedding2.txt"])
)