Skip to content

Commit

Permalink
More work
Browse files Browse the repository at this point in the history
  • Loading branch information
justinlittman committed Aug 27, 2024
1 parent c34bf47 commit 54f058d
Showing 1 changed file with 21 additions and 2 deletions.
23 changes: 21 additions & 2 deletions lib/tasks/dataset.rake
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,23 @@ task dataset: :environment do
# * Acquisitions Serials (3)
# * Hopkins Marine Station Collection (5)
# * Stanford Rare Books and Early Manuscripts (6)
# * Dept. of Special Collections, Manuscripts Collections - Supplemental Materials (7)
# * Archive of Recorded Sound/Music Library - Supplemental Materials (8)
# * Water in the West Reports and Working Papers (40)
# * Stanford Center for Ocean Solutions Reports and Working Papers (55)
# * Undergraduate Theses, Department of Physics (70)
# * Oakland Police Department Race Relations (81)
# * John A. Blume Earthquake Engineering Center Technical Report Series (82)
# * Undergraduate Honors Theses, Graduate School of Education (102)
# * Boothe Prize Winners, Stanford University (116)
# * Stanford Iran 2040 Project (125)
# * Electronic Acquisitions (143)
# * The Korea Program Prize for Writing in Korean Studies (145)
# * Free EEMs (144)
# * Rigler and Deutsch Record Index project at Stanford, revisited (155)
# * Stanford Research Data (168)
# * Publications and flyers by the Red Guard and mass organizations in Guangdong and other cities (218)
# * Xu Liangying papers (293)

sql = <<~SQL.squish
SELECT pdf_blobs.id
Expand All @@ -25,7 +37,7 @@ task dataset: :environment do
INNER JOIN active_storage_attachments ON active_storage_attachments.record_id=attached_files.id
INNER JOIN active_storage_blobs ON active_storage_attachments.blob_id=active_storage_blobs.id
WHERE active_storage_blobs.content_type = 'application/pdf'
AND works.collection_id NOT IN (2,3,5,6,8,82,144,155,218)
AND works.collection_id NOT IN (2,3,5,6,7,8,40,55,70,81,82,102,116,125,143,144,145,155,168,218,293)
AND work_versions.work_type='text'
AND work_versions.subtype='{Article}'
GROUP BY works.id
Expand All @@ -50,8 +62,9 @@ task dataset: :environment do
FileUtils.rm_rf('dataset')
FileUtils.mkdir_p('dataset')

rows = nil
File.open('dataset/metadata.jsonl', 'w') do |metadata_file|
work_ids.each do |work_id|
rows = work_ids.map do |work_id|
puts work_id
work = Work.find(work_id)
work_version = work.head
Expand All @@ -70,8 +83,14 @@ task dataset: :environment do
keywords: work_version.keywords.map(&:label)
}
metadata_file.write("#{metadata.to_json}\n")
[work_id, work.druid, pdf_filename, work_version.title, work.collection.head.name]
end
end

CSV.open('dataset/dataset.csv', 'w') do |csv|
csv << ['Work ID', 'Druid', 'PDF Filename', 'Title', 'Collection']
rows.shuffle.each { |row| csv << row }
end
end

def map_contributor(contributor)
Expand Down

0 comments on commit 54f058d

Please sign in to comment.