-
Notifications
You must be signed in to change notification settings - Fork 3
/
build_indri_index_challenge.py
120 lines (90 loc) · 4.28 KB
/
build_indri_index_challenge.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import pickle
import os
from utils import write_index, entry
from utils_text import title_plus_bigrams, normalize_name_title
# Index: each playlist id (pid) is a document whose terms are its track ids.
# Shared locations used by every section of this script: pickled maps are
# read from bipartite_path, index files are written under index_path.
bipartite_path = 'data/bipartite_challenge'
index_path = 'data/indexes_challenge'

# NOTE: pickle.load can execute arbitrary code embedded in the file; only
# point these paths at pickles produced by this project.
# The context managers close each file handle as soon as its pickle is read
# (the bare open(...) calls used previously left the handles open).
with open(os.path.join(bipartite_path,
                       'AllDataPidTitleBipartite.pkl'), 'rb') as fh:
    PidTitleBipartite = pickle.load(fh)
with open(os.path.join(bipartite_path,
                       'AllDataPidTrackListBipartite.pkl'), 'rb') as fh:
    PidTrackListBipartite = pickle.load(fh)

# buildIndexPidAsDocTracksAsTerms
index_name = '1MIndexPidAsDocTracksAsTerms.txt'
pidDocs = []
# Iterate the pids of the title map and look each one up in the track-list
# map; the 'spotify:track:' prefix is dropped so terms are bare track ids.
for pid in PidTitleBipartite:
    trackList = PidTrackListBipartite[pid]
    pidDocs.append(entry(str(pid), ' '.join(
        [item.replace('spotify:track:', '') for item in trackList])))

write_index(index_path, index_name, pidDocs)
# Index: each album id is a document whose terms are its de-duplicated
# track ids (IndAlbumAsDocNormTracksAsTerms).
# The context manager closes the pickle file once it has been read.
with open(os.path.join(bipartite_path,
                       'AllDataAlbumTrackSetBipartite.pkl'), 'rb') as fh:
    AllDataAlbumTrackSetBipartite = pickle.load(fh)

index_name_norm = '1MIndexAlbumAsDocNormTracksListAsTerms.txt'
albumDocsNorm = []
# NOTE(review): albumDocsNonNorm is populated but never passed to
# write_index in this script; kept so module-level state is unchanged.
albumDocsNonNorm = []
for albumid, tracks in AllDataAlbumTrackSetBipartite.items():
    # set() removes repeated tracks; the resulting term order is arbitrary.
    normTracks = list(set(tracks))
    albumDocsNonNorm.append(entry(albumid.strip(), ' '.join(
        [item.replace('spotify:track:', '') for item in tracks])))
    albumDocsNorm.append(entry(albumid.strip(), ' '.join(
        [item.replace('spotify:track:', '') for item in normTracks])))

write_index(index_path, index_name_norm, albumDocsNorm)
# Index: each artist id is a document whose terms are its de-duplicated
# track ids (written as 1MIndexArtistsAsDocNormTracksSetAsTerms.txt).
# The context manager closes the pickle file once it has been read.
with open(os.path.join(bipartite_path,
                       'AllDataArtistTrackSetBipartite.pkl'), 'rb') as fh:
    AllDataArtistTrackSetBipartite = pickle.load(fh)

index_name_norm = '1MIndexArtistsAsDocNormTracksSetAsTerms.txt'
artistDocsNorm = []
# NOTE(review): artistDocsNonNorm is populated but never passed to
# write_index in this script; kept so module-level state is unchanged.
artistDocsNonNorm = []
# The keys of this map are artist ids (the loop variable was previously
# named albumid, a leftover from the album section).
for artistid, tracks in AllDataArtistTrackSetBipartite.items():
    # set() removes repeated tracks; the resulting term order is arbitrary.
    normTracks = list(set(tracks))
    artistDocsNonNorm.append(entry(artistid.strip(), ' '.join(
        [item.replace('spotify:track:', '') for item in tracks])))
    artistDocsNorm.append(entry(artistid.strip(), ' '.join(
        [item.replace('spotify:track:', '') for item in normTracks])))

write_index(index_path, index_name_norm, artistDocsNorm)
# Indexes: each playlist title is a document whose terms are track ids.
# Two variants are written: one with de-duplicated tracks ("Norm") and one
# with the track sequence exactly as stored in the pickle ("NonNorm").
# The context manager closes the pickle file once it has been read.
with open(os.path.join(bipartite_path, 'TitleTrackId.pkl'), 'rb') as fh:
    TitleTrackId = pickle.load(fh)

index_name_norm = '1MIndexTitlesAsDocNormTracksSetAsTerms.txt'
index_name = '1MIndexTitlesAsDocNonNormTracksListAsTerms.txt'
titleDocsNorm = []
titletDocsNonNorm = []
for title, tracks in TitleTrackId.items():
    # set() removes repeated tracks; the resulting term order is arbitrary.
    normTracks = list(set(tracks))
    titletDocsNonNorm.append(entry(title.strip(), ' '.join(
        [item.replace('spotify:track:', '') for item in tracks])))
    titleDocsNorm.append(entry(title.strip(), ' '.join(
        [item.replace('spotify:track:', '') for item in normTracks])))

write_index(index_path, index_name_norm, titleDocsNorm)
write_index(index_path, index_name, titletDocsNonNorm)
# Index: each track id is a document whose terms come from the titles
# associated with that track, expanded by title_plus_bigrams.
# The context manager closes the pickle file once it has been read.
with open(os.path.join(bipartite_path, 'TrackIdTitle.pkl'), 'rb') as fh:
    TrackIdTitle = pickle.load(fh)

index_name = '1MIndexTracksAsDocTitlesAsTerms.txt'
trackTitleDocs = []
for trackId, titleList in TrackIdTitle.items():
    truncTrackId = trackId.replace('spotify:track:', '')
    # Each expanded title is prefixed with a single space, so the document
    # text starts with a space exactly as the incremental concatenation
    # did; a single join avoids rebuilding the string on every title.
    concatTitle = ''.join(
        [' ' + title_plus_bigrams(title) for title in titleList])
    trackTitleDocs.append(entry(truncTrackId.strip(), concatTitle))

write_index(index_path, index_name, trackTitleDocs)
# Index: each track id is a document whose terms are its normalized track
# name, album name and artist name, in that order.
index_name = '1MIndexTracksAsDocMeta2AsTerms.txt'

# Context managers close each pickle file as soon as it has been read.
with open(os.path.join(bipartite_path, 'TrackIdTrackName.pkl'), 'rb') as fh:
    TrackIdTrackName = pickle.load(fh)
# 'Abum' is the spelling used by the pickle file on disk; do not correct it.
with open(os.path.join(bipartite_path, 'TrackIdAbumName.pkl'), 'rb') as fh:
    TrackIdAbumName = pickle.load(fh)
with open(os.path.join(bipartite_path, 'TrackIdArtistName.pkl'), 'rb') as fh:
    TrackIdArtistName = pickle.load(fh)

meta2trackDocs = []
# Track ids are taken from the track-name map and looked up in the album
# and artist maps.
for trackId, trackname in TrackIdTrackName.items():
    truncTrackId = trackId.replace('spotify:track:', '')
    normTrackName = normalize_name_title(trackname)
    normAlbumName = normalize_name_title(TrackIdAbumName[trackId])
    normArtistName = normalize_name_title(TrackIdArtistName[trackId])
    meta2trackDocs.append(
        entry(truncTrackId,
              normTrackName + ' ' + normAlbumName + ' ' + normArtistName))

write_index(index_path, index_name, meta2trackDocs)