Skip to content
This repository was archived by the owner on Sep 16, 2023. It is now read-only.

Commit 41b66e9

Browse files
committed
feat: add option to choose the maximum number of game streams to collect
1 parent 692895c commit 41b66e9

File tree

4 files changed

+60
-21
lines changed

4 files changed

+60
-21
lines changed

setup.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
long_description = fh.read()
99

1010
setup(name='twitchanal',
11-
version='0.1.0',
11+
version='0.1.1',
1212
author='yuukidach',
1313
author_email='[email protected]',
1414
long_description=long_description,

src/twitchanal/cli.py

+7-2
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,10 @@ def save_user(dir):
3333
default=True,
3434
help='Whether to use timestamp as suffix for data file.')
3535
@click.option('--num', '-n', default=251, help='Number of games to collect.')
36+
@click.option('--stream',
37+
'-s',
38+
default=100,
39+
help='Number of game streams to collect.')
3640
@click.option(
3741
'--extra/--no-extra',
3842
default=True,
@@ -42,7 +46,8 @@ def save_user(dir):
4246
@click.option('--debug/--no-debug',
4347
default=False,
4448
help='Run in debug mode or not.')
45-
def collect(dir: str, timestamp: bool, num: int, extra: bool, debug: bool):
49+
def collect(dir: str, timestamp: bool, num: int, stream: int, extra: bool,
50+
debug: bool):
4651
""" Collect data for analysis
4752
"""
4853
try:
@@ -55,7 +60,7 @@ def collect(dir: str, timestamp: bool, num: int, extra: bool, debug: bool):
5560
else:
5661
lv = logging.INFO
5762
logging.basicConfig(filename='twitchanal.log', level=lv)
58-
collect_data(dir, timestamp, num, extra)
63+
collect_data(dir, timestamp, num, stream, extra)
5964

6065

6166
cli.add_command(save_user)

src/twitchanal/collect/fetch.py

+38-11
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,16 @@ def turn_into_df(data: dict) -> pd.DataFrame:
3434

3535

3636
def fetch_twitch_data(twitch: Twitch, fn_name: str, **kwargs) -> pd.DataFrame:
37+
""" fetch data from Twitch API
38+
39+
Args:
40+
twitch (Twitch): twitchAPI object
41+
fn_name (str): function name of twitchAPI
42+
**kwargs: arguments for fn_name
43+
44+
Returns:
45+
pd.DataFrame: fetched data
46+
"""
3747
n = kwargs['first']
3848
fn = getattr(twitch, fn_name)
3949

@@ -44,8 +54,10 @@ def fetch_twitch_data(twitch: Twitch, fn_name: str, **kwargs) -> pd.DataFrame:
4454
while (n > 0):
4555
kwargs['first'] = min(100, n)
4656
n -= kwargs['first']
57+
# check if there are more pages
58+
if not data_all['pagination']:
59+
break
4760
kwargs['after'] = data_all['pagination']['cursor']
48-
4961
data_all = fn(**kwargs)
5062
data = pd.concat([data, turn_into_df(data_all)])
5163

@@ -67,31 +79,47 @@ def fetch_top_games(twitch: Twitch, n: int = 100) -> pd.DataFrame:
6779
return top_games
6880

6981

70-
def fetch_game_streams(twitch: Twitch, game_id: str) -> pd.DataFrame:
82+
def fetch_game_streams(twitch: Twitch,
83+
game_id: str,
84+
n: int = 100) -> pd.DataFrame:
7185
""" fetch game streams data from Twitch API
7286
7387
Args:
7488
twitch (Twitch): twitch api instance
7589
game_id (str): id of the game whose streams are fetched
90+
n (int): how many streams to fetch
7691
7792
Returns:
7893
pd.DataFrame / None: dataframe of game streams
7994
"""
80-
game_streams = twitch.get_streams(first=100, game_id=[game_id])
81-
game_streams = turn_into_df(game_streams)
95+
kwargs = {'first': n, 'game_id': [game_id]}
96+
game_streams = fetch_twitch_data(twitch, 'get_streams', **kwargs)
97+
# game_streams = twitch.get_streams(first=100, game_id=[game_id])
98+
# game_streams = turn_into_df(game_streams)
8299
# get user id to dig more data
83100
try:
84-
user_ids = game_streams['user_id'].tolist()
101+
total_user_ids = game_streams['user_id'].tolist()
102+
user_ids_num = len(total_user_ids)
103+
ephoch = user_ids_num // 100
104+
if user_ids_num % 100 != 0:
105+
ephoch += 1
85106
except:
86107
print('game_streams')
87108
cprint('Error: ' + game_id + ' data broken. Jump over it.', 'red')
88109
return None
89110
else:
90-
users_data = twitch.get_users(user_ids=user_ids)
91-
users_data = turn_into_df(users_data)
92-
# select needed columns
93-
users_data = users_data[['broadcaster_type', 'description', 'type']]
94-
game_streams = pd.concat([game_streams, users_data], axis=1)
111+
total_users_data = pd.DataFrame(columns=['broadcaster_type', 'description', 'type'])
112+
for i in range(ephoch):
113+
user_ids = total_user_ids[i*100: i*100+100]
114+
users_data = twitch.get_users(user_ids=user_ids)
115+
users_data = turn_into_df(users_data)
116+
# select needed columns
117+
users_data = users_data[['broadcaster_type', 'description', 'type']]
118+
total_users_data = total_users_data.append(users_data, ignore_index=True)
119+
120+
total_users_data.reset_index(drop=True, inplace=True)
121+
game_streams.reset_index(drop=True, inplace=True)
122+
game_streams = pd.concat([game_streams, total_users_data], axis=1)
95123
return game_streams
96124

97125

@@ -151,4 +179,3 @@ def fetch_game_info(df: pd.DataFrame) -> pd.DataFrame:
151179

152180
df = df.assign(**data_dict)
153181
return df
154-

src/twitchanal/collect/save.py

+14-7
Original file line numberDiff line numberDiff line change
@@ -29,20 +29,24 @@ def save_data_csv(folder: str, fname: str, data: pd.DataFrame) -> NoReturn:
2929
print('Finish writing', fname)
3030

3131

32-
def save_game_streams(twitch: Twitch, data_folder: str, game_id: str,
33-
fname: str) -> NoReturn:
32+
def save_game_streams(twitch: Twitch,
33+
data_folder: str,
34+
game_id: str,
35+
fname: str,
36+
n: int = 100) -> NoReturn:
3437
""" save live streams
3538
3639
Args:
3740
twitch (Twitch): twitch api class instance
3841
data_folder (str): folder that contains the data
3942
game_id (str): game id
4043
fname (str): data file name
41-
44+
n (int): number of live streams to collect. Defaults to 100.
45+
4246
Returns:
4347
NoReturn
4448
"""
45-
game_streams = fetch_game_streams(twitch, game_id)
49+
game_streams = fetch_game_streams(twitch, game_id, n)
4650
if not game_streams is None:
4751
save_data_csv(data_folder, fname, game_streams)
4852

@@ -74,14 +78,16 @@ def save_n_game_streams(twitch: Twitch,
7478
twitchs = [twitch] * len
7579
data_folders = [data_folder] * len
7680
game_ids = data['id'].tolist()
81+
n = [n] * len
7782
pool = ThreadPool(10)
78-
pool.starmap(save_game_streams, zip(twitchs, data_folders, game_ids,
79-
fnames))
83+
pool.starmap(save_game_streams,
84+
zip(twitchs, data_folders, game_ids, fnames, n))
8085

8186

8287
def collect_data(data_folder: str = './dataset',
8388
with_timestamp: bool = True,
8489
num: int = 251,
90+
stream: int = 100,
8591
extra: bool = True) -> NoReturn:
8692
""" collecet data from twitch api
8793
@@ -90,6 +96,7 @@ def collect_data(data_folder: str = './dataset',
9096
with_timestamp (bool, optional): whether using a timestamp as suffix or not.
9197
Defaults to True.
9298
num (int, optional): Number of games to collect.
99+
stream (int, optional): Number of streams to collect.
93100
extra (bool, optional): Whether to collect extra info like `peek viewers`,
94101
`peek channels` and so on for top games.
95102
@@ -109,7 +116,7 @@ def collect_data(data_folder: str = './dataset',
109116
timestamp = ""
110117

111118
top_games = fetch_top_games(twitch, num)
112-
save_n_game_streams(twitch, data_folder, top_games, timestamp)
119+
save_n_game_streams(twitch, data_folder, top_games, timestamp, stream)
113120
if extra:
114121
top_games = fetch_game_info(top_games)
115122
save_data_csv(data_folder, 'top_games' + timestamp, top_games)

0 commit comments

Comments
 (0)