-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathjustfile
222 lines (186 loc) · 8.54 KB
/
justfile
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
# The MIT License (MIT)
#
# Copyright (c) 2024 Aliaksei Bialiauski
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included
# in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
# Install dependencies.
# Installs the ghminer CLI globally, flake8 for linting, the poe poetry
# plugin, and both sub-projects' poetry environments.
# NOTE: `${RULTOR:+sudo}` expands to `sudo` only when the RULTOR env var
# is set (CI); locally it expands to nothing. Each line runs in its own
# shell, so each `cd ... && poetry install` is self-contained.
install:
    ${RULTOR:+sudo} npm install -g [email protected]
    python -m pip install flake8
    poetry self add 'poethepoet[poetry_plugin]'
    cd sr-data && poetry install
    cd sr-train && poetry install
# Full build.
# Installs deps, runs the test suite, then lints.
# `tests` is a pytest `-m` marker expression forwarded to the `test` recipe.
full tests="fast or deep":
    poetry install
    just test "{{tests}}"
    just check
# Run tests.
# `which` is a pytest marker expression; `cov` carries optional extra
# pytest flags and is deliberately left unquoted so an empty default
# contributes no argument.
test which="fast" cov="":
    poetry run pytest -m "{{which}}" {{cov}}
# Check quality of source code.
# Lints all three packages with flake8.
check:
    flake8 sr-data sr-train sr-filter
# Run experiment.
# Records a UTC timestamp into sr-data/experiment/now.txt, then drives
# the pipeline: collect -> filter -> extract -> embed -> datasets -> cluster.
# NOTE(review): several nested calls appear to be missing required
# arguments per the recipe signatures below — `collect` requires
# dir/query/start/end/out, `filter` requires `filtered`, `datasets`
# requires embeddings/run, and `cluster` requires dir/out. Confirm these
# invocations actually run, or supply the missing arguments.
@experiment:
    NOW=$(date +%F):$(TZ=UTC date +%T) && echo "$NOW" >> now.txt; \
    echo Experiment datetime is: "$NOW (UTC)"
    mkdir -p sr-data/experiment
    mv now.txt sr-data/experiment/now.txt
    just collect
    just filter experiment/repos.csv
    just extract experiment/after-filter.csv
    just embed experiment/after-extract.csv
    just datasets
    just cluster
# SR-data pipeline for the experiments.
# `steps`, `pipes` and `out` are forwarded verbatim to the `pipeline`
# poe task; `representation` defaults to the bundled pipeline spec.
pipeline steps pipes out representation="resources/pipeline.json":
    cd sr-data && poetry poe pipeline --representation {{representation}} \
      --steps {{steps}} --pipes "{{pipes}}" --out "{{out}}"
# Clean up experiment.
# Removes the whole sr-data/experiment directory. `rm -rf` also covers
# dotfiles and does not fail when the directory is already empty or
# absent — the previous `rm dir/* && rmdir dir` failed in both cases.
clean:
    echo "Cleaning up sr-data/experiment..."
    rm -rf sr-data/experiment
# Collect repositories.
# dir: output directory (created if missing); query/start/end: GitHub
# search query and date range; out: ghminer output base filename.
# Reads GitHub personal access tokens from the $PATS env var.
collect dir query start end out:
    mkdir -p {{dir}}
    ghminer --query "{{query}}" --start "{{start}}" --end "{{end}}" \
      --tokens "$PATS" --filename "{{out}}" --graphql "sr-data/resources/ghminer.graphql" \
      --schema "sr-data/resources/ghminer.json"
# Fetch pulls count for collected repos.
# repos: input CSV; token: GitHub token forwarded to the poe task.
pulls repos token out="experiment/with-pulls.csv":
    cd sr-data && poetry poe pulls --repos "{{repos}}" --token "{{token}}" \
      --out "{{out}}"
# Collect maven pom.xml files.
# NOTE(review): flag order differs from `pulls` (--out before --token);
# harmless for argparse-style CLIs but inconsistent with siblings.
maven repos token out="experiment/with-maven.csv":
    cd sr-data && poetry poe maven --repos "{{repos}}" --out "{{out}}" \
      --token "{{token}}"
# Count of JUnit tests.
# repos: input CSV; token: GitHub token forwarded to the poe task.
junit repos token out="experiment/after-junit.csv":
    cd sr-data && poetry poe junit --repos "{{repos}}" --out "{{out}}" \
      --token "{{token}}"
# Collect test repositories.
# Mines a small fixed-date sample into sr-data/tmp/test-repos using
# ghminer; reads GitHub tokens from the $PATS env var. The ghminer
# `--filename` is relative to sr-data because of the `cd`.
test-collect:
    mkdir -p sr-data/tmp
    cd sr-data && ghminer --query "stars:>10 language:java size:>=20 mirror:false template:false" \
      --start "2024-05-01" --end "2024-05-01" --tokens "$PATS" \
      --filename "tmp/test-repos"
# License filter.
# Delegates to the `license_filter` poe task in sr-data.
license_filter repos out="experiment/after-license-filter.csv":
    cd sr-data && poetry poe license_filter --repos "{{repos}}" --out "{{out}}"
# Filter collected repositories.
# `filtered` is a required second output/input (see the poe task);
# note `@experiment` above calls this recipe without it.
filter repos filtered out="experiment/after-filter.csv":
    cd sr-data && poetry poe filter --repos "{{repos}}" --out "{{out}}" --filtered "{{filtered}}"
# Extract headings from README files.
extract repos out="experiment/after-extract.csv":
    cd sr-data && poetry poe extract --repos "{{repos}}" --out "{{out}}"
# Collect most common words.
mcw repos out="experiment/after-mcw.csv":
    cd sr-data && poetry poe mcw --repos "{{repos}}" --out "{{out}}"
# Special words count.
# `config` points at the word list consumed by the swc poe task.
swc repos out="experiment/after-swc.csv" config="resources/swc-words.txt":
    cd sr-data && poetry poe swc --repos "{{repos}}" --out "{{out}}" \
      --config "{{config}}"
# Calculate length metrics.
# Interpolations are quoted so paths containing spaces survive shell
# word splitting, matching the quoting style of the other recipes.
lens repos out="experiment/after-lens.csv":
    cd sr-data && poetry poe lens --repos "{{repos}}" --out "{{out}}"
# Count snippets.
# Quoted interpolations for consistency with the other sr-data recipes
# and safety with paths containing spaces.
snippets repos out="experiment/after-snippets.csv":
    cd sr-data && poetry poe snippets --repos "{{repos}}" --out "{{out}}"
# Count links.
# Quoted interpolations for consistency with the other sr-data recipes
# and safety with paths containing spaces.
links repos out="experiment/after-links.csv":
    cd sr-data && poetry poe links --repos "{{repos}}" --out "{{out}}"
# GitHub pull requests and issues mentions.
# Quoted interpolations for consistency with the other sr-data recipes
# and safety with paths containing spaces.
ghmentions repos out="experiment/after-ghmentions.csv":
    cd sr-data && poetry poe ghmentions --repos "{{repos}}" --out "{{out}}"
# Run sentiment analysis on READMEs.
# Quoted interpolations for consistency with the other sr-data recipes
# and safety with paths containing spaces.
sentiments repos out="experiment/after-sentiments.csv":
    cd sr-data && poetry poe sentiments --repos "{{repos}}" --out "{{out}}"
# GitHub workflows info.
# Quoted interpolations for consistency with the other sr-data recipes
# and safety with paths containing spaces.
workflows repos out="experiment/after-workflows.csv":
    cd sr-data && poetry poe workflows --repos "{{repos}}" --out "{{out}}"
# Compose all found metadata into final CSV.
# Quoted interpolations for consistency with the other sr-data recipes
# and safety with paths containing spaces.
final latest out="experiment/final.csv":
    cd sr-data && poetry poe final --latest "{{latest}}" --out "{{out}}"
# Create embeddings.
# Reads HuggingFace and Cohere credentials from $HF_TOKEN / $COHERE_TOKEN.
# `{{prefix}}` is now quoted like `{{repos}}` so paths with spaces work.
embed repos prefix="experiment/embeddings":
    cd sr-data && poetry poe embed --repos "{{repos}}" --prefix "{{prefix}}" \
      --hf "$HF_TOKEN" --cohere "$COHERE_TOKEN"
# Create datasets.
# @todo #134:35min: Remove run ad-hoc solution for just command resolution.
# Now, we passing run parameter from recipe to nested just invocations in
# order to resolve just command. We should refine our usage of full path in
# the entire justfile.
# @todo #134:45min: Refactor recipes to be more optimally granular.
# We should create more major recipes in order to reuse across the project.
# The example of such step is `@experiment`. Let's do similar to the script
# inside `data.sh`, so it can be invoked from just using datasets step.
# `run` is the directory containing the `just` binary (see todo above);
# `embeddings` is forwarded to `combine`, where it is evaluated as a
# shell command — presumably `true`/`false`; confirm against callers.
datasets embeddings run dir="experiment" numbase="after-extract.csv":
    "{{run}}"/just numerical "{{dir}}/{{numbase}}" "{{dir}}/numerical.csv"
    "{{run}}"/just scores "{{dir}}/{{numbase}}" "{{dir}}/scores.csv"
    "{{run}}"/just combine "{{run}}" "{{embeddings}}" "{{dir}}"
# Combine datasets with embeddings.
# `{{embeddings}}` is executed by the shell as the `if` condition, so it
# is presumably the literal string `true` or `false` — confirm at call
# sites. The three byte-identical conditionals are folded into one; the
# `&&` chain preserves the original fail-fast behavior (a failing
# combination aborts the recipe before the next one runs).
combine run embeddings dir:
    if "{{embeddings}}"; then \
      "{{run}}"/just combination "{{dir}}" 5 "{{dir}}/sbert.csv" "{{dir}}/scores.csv" && \
      "{{run}}"/just combination "{{dir}}" 6 "{{dir}}/e5.csv" "{{dir}}/scores.csv" && \
      "{{run}}"/just combination "{{dir}}" 7 "{{dir}}/embedv3.csv" "{{dir}}/scores.csv"; \
    fi
# Create scores dataset.
scores repos out="experiment/scores.csv":
    cd sr-data && poetry poe scores --repos "{{repos}}" --out "{{out}}"
# Create all numerical dataset.
numerical repos out="experiment/numerical.csv":
    cd sr-data && poetry poe numerical --repos "{{repos}}" --out "{{out}}"
# Create dataset from combination.
# `identifier` selects which combined dataset to build (see `combine`,
# which passes 5/6/7 for sbert/e5/embedv3 respectively).
combination dir identifier embeddings scores="experiment/scores.csv":
    cd sr-data && poetry poe combination --scores "{{scores}}" \
      --embeddings "{{embeddings}}" --dir "{{dir}}" --identifier "{{identifier}}"
# Merge dataset folders into one.
merge datasets out branch:
    cd sr-data && poetry poe merge --datasets "{{datasets}}" --out "{{out}}" \
      --branch "{{branch}}"
# Download datasets.
dataset folder out:
    cd sr-train && poetry poe dataset --folder "{{folder}}" --out "{{out}}"
# Cluster repositories.
# Runs the cluster poe task over every generated dataset (d0..d7) in the
# original order. The loop replaces eight copy-pasted lines; `|| exit 1`
# preserves the original fail-fast behavior (just aborts the recipe when
# a line fails, so later datasets are not clustered after a failure).
cluster dir out:
    cd sr-train && for ds in d0-numerical d1-scores d2-sbert d3-e5 \
      d4-embedv3 d5-scores+sbert d6-scores+e5 d7-scores+embedv3; do \
      poetry poe cluster --dataset "{{dir}}/$ds.csv" --dir "{{out}}" || exit 1; \
    done
# Statistics about generated clusters.
# Quoted interpolations for consistency with the other recipes and
# safety with paths containing spaces.
clusterstat out dir="experiment":
    cd sr-train && poetry poe clusterstat --dir "{{dir}}" --out "{{out}}"
# Remove readme from CSV file.
# Can be useful for inspecting large files.
# Quoted interpolations for consistency with the other sr-data recipes
# and safety with paths containing spaces.
noreadme repos out="experiment/no-readme.csv":
    cd sr-data && poetry poe no_readme --repos "{{repos}}" --out "{{out}}"
# Build paper with LaTeX.
# Prints the latexmk version first (sanity check that it is installed),
# then compiles sr-paper/paper.tex to PDF.
paper:
    latexmk --version
    cd sr-paper && latexmk paper.tex -pdf