feat: add option to choose the maximum number of game streams to collect

yuukidach · yuukidach · commit 41b66e97af6b · 2020-11-14T22:51:25.000+08:00
diff --git a/setup.py b/setup.py
@@ -8,7 +8,7 @@
     long_description = fh.read()
 
 setup(name='twitchanal',
-      version='0.1.0',
+      version='0.1.1',
       author='yuukidach',
       author_email='chendamailbox@foxmail.com',
       long_description=long_description,
diff --git a/src/twitchanal/cli.py b/src/twitchanal/cli.py
@@ -33,6 +33,10 @@ def save_user(dir):
               default=True,
               help='Whether to use timestamp as suffix for data file.')
 @click.option('--num', '-n', default=251, help='Number of games to collect.')
+@click.option('--stream',
+              '-s',
+              default=100,
+              help='Number of game streams to collect.')
 @click.option(
     '--extra/--no-extra',
     default=True,
@@ -42,7 +46,8 @@ def save_user(dir):
 @click.option('--debug/--no-debug',
               default=False,
               help='Run in debug mode or not.')
-def collect(dir: str, timestamp: bool, num: int, extra: bool, debug: bool):
+def collect(dir: str, timestamp: bool, num: int, stream: int, extra: bool,
+            debug: bool):
     """ Collect data for analysis
     """
     try:
@@ -55,7 +60,7 @@ def collect(dir: str, timestamp: bool, num: int, extra: bool, debug: bool):
     else:
         lv = logging.INFO
     logging.basicConfig(filename='twitchanal.log', level=lv)
-    collect_data(dir, timestamp, num, extra)
+    collect_data(dir, timestamp, num, stream, extra)
 
 
 cli.add_command(save_user)
diff --git a/src/twitchanal/collect/fetch.py b/src/twitchanal/collect/fetch.py
@@ -34,6 +34,16 @@ def turn_into_df(data: dict) -> pd.DataFrame:
 
 
 def fetch_twitch_data(twitch: Twitch, fn_name: str, **kwargs) -> pd.DataFrame:
+    """ fetch data from Twitch API
+
+    Args:
+        twitch (Twitch): twitchAPI object
+        fn_name (str): function name of twitchAPI
+        **kwargs: arguments for fn_name
+
+    Returns:
+        pd.DataFrame: fetched data
+    """
     n = kwargs['first']
     fn = getattr(twitch, fn_name)
 
@@ -44,8 +54,10 @@ def fetch_twitch_data(twitch: Twitch, fn_name: str, **kwargs) -> pd.DataFrame:
     while (n > 0):
         kwargs['first'] = min(100, n)
         n -= kwargs['first']
+        # check if there are more pages
+        if not data_all['pagination']:
+            break
         kwargs['after'] = data_all['pagination']['cursor']
-
         data_all = fn(**kwargs)
         data = pd.concat([data, turn_into_df(data_all)])
 
@@ -67,31 +79,47 @@ def fetch_top_games(twitch: Twitch, n: int = 100) -> pd.DataFrame:
     return top_games
 
 
-def fetch_game_streams(twitch: Twitch, game_id: str) -> pd.DataFrame:
+def fetch_game_streams(twitch: Twitch,
+                       game_id: str,
+                       n: int = 100) -> pd.DataFrame:
     """ fetch game streams data from Twitch API
 
     Args:
         twitch (Twitch): twitch api instance
         game_ids (str): list of game ids
+        n (int): how many streams to fetch
 
     Returns:
         pd.DataFrame / None: dataframe of game streams
     """
-    game_streams = twitch.get_streams(first=100, game_id=[game_id])
-    game_streams = turn_into_df(game_streams)
+    kwargs = {'first': n, 'game_id': [game_id]}
+    game_streams = fetch_twitch_data(twitch, 'get_streams', **kwargs)
+    # game_streams = twitch.get_streams(first=100, game_id=[game_id])
+    # game_streams = turn_into_df(game_streams)
     # get user id to dig more data
     try:
-        user_ids = game_streams['user_id'].tolist()
+        total_user_ids = game_streams['user_id'].tolist()
+        user_ids_num = len(total_user_ids)
+        ephoch = user_ids_num // 100
+        if user_ids_num % 100 != 0:
+            ephoch += 1
     except:
         print('game_streams')
         cprint('Error: ' + game_id + ' data broken. Jump over it.', 'red')
         return None
     else:
-        users_data = twitch.get_users(user_ids=user_ids)
-        users_data = turn_into_df(users_data)
-        # select needed columns
-        users_data = users_data[['broadcaster_type', 'description', 'type']]
-        game_streams = pd.concat([game_streams, users_data], axis=1)
+        total_users_data = pd.DataFrame(columns=['broadcaster_type', 'description', 'type'])
+        for i in range(ephoch):
+            user_ids = total_user_ids[i*100: i*100+100]
+            users_data = twitch.get_users(user_ids=user_ids)
+            users_data = turn_into_df(users_data)
+            # select needed columns
+            users_data = users_data[['broadcaster_type', 'description', 'type']]
+            total_users_data = total_users_data.append(users_data, ignore_index=True)
+
+        total_users_data.reset_index(drop=True, inplace=True)
+        game_streams.reset_index(drop=True, inplace=True)
+        game_streams = pd.concat([game_streams, total_users_data], axis=1)
         return game_streams
 
 
@@ -151,4 +179,3 @@ def fetch_game_info(df: pd.DataFrame) -> pd.DataFrame:
 
     df = df.assign(**data_dict)
     return df
-
diff --git a/src/twitchanal/collect/save.py b/src/twitchanal/collect/save.py
@@ -29,20 +29,24 @@ def save_data_csv(folder: str, fname: str, data: pd.DataFrame) -> NoReturn:
     print('Finish writing', fname)
 
 
-def save_game_streams(twitch: Twitch, data_folder: str, game_id: str,
-                      fname: str) -> NoReturn:
+def save_game_streams(twitch: Twitch,
+                      data_folder: str,
+                      game_id: str,
+                      fname: str,
+                      n: int = 100) -> NoReturn:
     """ save live streams
 
     Args:
         twitch (Twitch): twitch api class instance
         data_folder (str): folder to contains data
         game_id (str): game id
         fname (str): data file name
-    
+        n (int): number of live streams to collect. Defaults to 100.
+
     Returns:
         NoReturn
     """
-    game_streams = fetch_game_streams(twitch, game_id)
+    game_streams = fetch_game_streams(twitch, game_id, n)
     if not game_streams is None:
         save_data_csv(data_folder, fname, game_streams)
 
@@ -74,14 +78,16 @@ def save_n_game_streams(twitch: Twitch,
     twitchs = [twitch] * len
     data_folders = [data_folder] * len
     game_ids = data['id'].tolist()
+    n = [n] * len
     pool = ThreadPool(10)
-    pool.starmap(save_game_streams, zip(twitchs, data_folders, game_ids,
-                                        fnames))
+    pool.starmap(save_game_streams,
+                 zip(twitchs, data_folders, game_ids, fnames, n))
 
 
 def collect_data(data_folder: str = './dataset',
                  with_timestamp: bool = True,
                  num: int = 251,
+                 stream: int = 100,
                  extra: bool = True) -> NoReturn:
     """ collecet data from twitch api
 
@@ -90,6 +96,7 @@ def collect_data(data_folder: str = './dataset',
         with_timestamp (bool, optional): whether using a timestamp as suffix or not. 
                                          Defaults to True.
         num (int, optional): Number of games to collect.
+        stream (int, optional): Number of streams to collect.
         extra (bool, optional): Whether to collect extra info like `peek viewers`, 
                                 `peek channels` and so on for top games.
     
@@ -109,7 +116,7 @@ def collect_data(data_folder: str = './dataset',
         timestamp = ""
 
     top_games = fetch_top_games(twitch, num)
-    save_n_game_streams(twitch, data_folder, top_games, timestamp)
+    save_n_game_streams(twitch, data_folder, top_games, timestamp, stream)
     if extra:
         top_games = fetch_game_info(top_games)
     save_data_csv(data_folder, 'top_games' + timestamp, top_games)