-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtranscribe_aws.py
78 lines (68 loc) · 2.38 KB
/
transcribe_aws.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
import datetime
import json
import os
import pickle
import re
import sys
import time
import urllib
import urllib.parse

import boto3
import requests
# Transcribe an audio object stored in S3 with Amazon Transcribe and pickle
# the resulting transcript JSON under OUTPUT_DIR.
#
# Usage: python transcribe_aws.py <object-key-in-bucket>

# Base URL of the bucket that holds the source audio; the command-line
# argument is appended to it to form the media URI.
BUCKET_URL = "https://s3-ap-northeast-1.amazonaws.com/ysdyt-audios"
# Sent with every job regardless of the input file's extension.
MEDIA_FORMAT = 'mp3'
LANGUAGE_CODE = 'ja-JP'
OUTPUT_DIR = './transcribed_file'
POLL_INTERVAL_SECONDS = 5
DOWNLOAD_TIMEOUT_SECONDS = 60


def split_presigned_uri(full_uri):
    """Split a signed transcript URI into its base URL and query parameters.

    The query string is parsed generically, so the result does not depend on
    the order in which the ``X-Amz-*`` parameters appear, and every value is
    percent-decoded exactly once (``requests`` re-encodes them when sending).

    Args:
        full_uri: The complete URI, including any query string.

    Returns:
        A ``(base_url, params)`` tuple where ``base_url`` has no query or
        fragment and ``params`` is a list of ``(name, value)`` pairs in their
        original order (empty if the URI has no query string).
    """
    parts = urllib.parse.urlsplit(full_uri)
    base_url = urllib.parse.urlunsplit(
        (parts.scheme, parts.netloc, parts.path, '', '')
    )
    params = urllib.parse.parse_qsl(parts.query, keep_blank_values=True)
    return base_url, params


def wait_for_job(transcribe, job_name, poll_interval=POLL_INTERVAL_SECONDS):
    """Poll a transcription job until it finishes and return its description.

    Args:
        transcribe: A boto3 Transcribe client.
        job_name: Name of the job to poll.
        poll_interval: Seconds to sleep between polls.

    Returns:
        The ``get_transcription_job`` response of the completed job.

    Raises:
        RuntimeError: If the job ends in the ``FAILED`` state. A failed job
            has no transcript to download, so this is reported here instead
            of surfacing later as a missing-key error.
    """
    while True:
        status = transcribe.get_transcription_job(TranscriptionJobName=job_name)
        state = status['TranscriptionJob']['TranscriptionJobStatus']
        if state in ('COMPLETED', 'FAILED'):
            break
        print("Not ready yet...")
        time.sleep(poll_interval)
    print(status)
    if state == 'FAILED':
        reason = status['TranscriptionJob'].get('FailureReason', 'no reason given')
        raise RuntimeError(
            "Transcription job {} failed: {}".format(job_name, reason)
        )
    return status


def fetch_transcript(full_uri, timeout=DOWNLOAD_TIMEOUT_SECONDS):
    """Download the transcript JSON referenced by ``full_uri``.

    Args:
        full_uri: The transcript file URI reported by the finished job.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The decoded JSON document.

    Raises:
        requests.HTTPError: If the server answers with an error status, so an
            error body is never mistaken for a transcript.
    """
    base_url, params = split_presigned_uri(full_uri)
    response = requests.get(base_url, params=params, timeout=timeout)
    response.raise_for_status()
    return response.json()


def save_transcript(transcribed_result, file_b_name, output_dir=OUTPUT_DIR):
    """Pickle ``transcribed_result`` to ``<output_dir>/<file_b_name>.pickle``.

    Args:
        transcribed_result: The object to serialize.
        file_b_name: Input file name without its extension; may contain
            ``/`` when the object key is nested.
        output_dir: Directory that receives the pickle file.

    Returns:
        The path of the file that was written.
    """
    out_path = '{}/{}.pickle'.format(output_dir, file_b_name)
    # Create the full parent path so nested object keys do not break open().
    os.makedirs(os.path.dirname(out_path), exist_ok=True)
    with open(out_path, 'wb') as f:
        pickle.dump(transcribed_result, f)
    return out_path


def main(argv=None):
    """Start a transcription job for the given file, wait, and save the result.

    Args:
        argv: Argument vector; defaults to ``sys.argv``. ``argv[1]`` is the
            name of the audio object inside the bucket.
    """
    args = sys.argv if argv is None else argv
    if len(args) < 2:
        sys.exit("usage: {} <audio-file-name>".format(args[0] if args else 'transcribe_aws.py'))

    file_name = args[1]
    file_b_name = os.path.splitext(file_name)[0]

    # The start timestamp doubles as the job name.
    job_name = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
    job_uri = "{}/{}".format(BUCKET_URL, file_name)

    transcribe = boto3.client('transcribe')
    transcribe.start_transcription_job(
        TranscriptionJobName=job_name,
        Media={'MediaFileUri': job_uri},
        MediaFormat=MEDIA_FORMAT,
        LanguageCode=LANGUAGE_CODE,
    )

    try:
        status = wait_for_job(transcribe, job_name)
    except RuntimeError as err:
        sys.exit(str(err))

    # Save the transcribed result =================
    full_uri = status['TranscriptionJob']['Transcript']['TranscriptFileUri']
    transcribed_result = fetch_transcript(full_uri)
    save_transcript(transcribed_result, file_b_name)


if __name__ == "__main__":
    main()