From 70107868f23cf60e32ea4282db4810cd6137d654 Mon Sep 17 00:00:00 2001
From: Clinton Boys
Date: Wed, 18 Mar 2015 15:01:38 +0200
Subject: [PATCH] committing v 1.0

---
 .DS_Store     | Bin 0 -> 6148 bytes
 README.md     |   0
 gt_scraper.py | 256 ++++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 256 insertions(+)
 create mode 100644 .DS_Store
 create mode 100644 README.md
 create mode 100644 gt_scraper.py

diff --git a/.DS_Store b/.DS_Store
new file mode 100644
index 0000000000000000000000000000000000000000..5008ddfcf53c02e82d7eee2e57c38e5672ef89f6
GIT binary patch
literal 6148
[binary .DS_Store contents omitted]

diff --git a/README.md b/README.md
new file mode 100644

diff --git a/gt_scraper.py b/gt_scraper.py
new file mode 100644
--- /dev/null
+++ b/gt_scraper.py
@@ -0,0 +1,256 @@
+import copy
+import csv
+import os
+import re
+import shutil
+import sys
+import time
+import webbrowser
+
+import numpy as np
+import pandas as pd
+from pandas import DataFrame
+
+month_list = ['January', 'February', 'March', 'April', 'May', 'June',
+              'July', 'August', 'September', 'October', 'November', 'December']
+
+# ScrapeTwoMonths opens a Google Trends export URL for a two-month window
+# starting at startmonth of the given year; the browser saves the report
+# as a .csv into the Downloads folder.
+
+def ScrapeTwoMonths(keyword, year, startmonth):
+
+    URL_start = "http://www.google.com/trends/trendsReport?&q="
+    URL_end = "&cmpt=q&content=1&export=1"
+
+    queries = keyword[0]
+    if len(keyword) > 1:
+        queries_list = []
+        for i in range(0,len(keyword)):
+            queries_list.append(keyword[i])
+        queries = '%20'.join(queries_list)
+
+    date = '&date='+str(startmonth)+'%2F'+str(year)+'%202m'
+
+    URL = URL_start+queries+date+URL_end
+
+    webbrowser.open(URL)
+
+# ScrapeRange will download all files for the relevant months.
+#
+# WARNING: Before you run ScrapeRange, ensure your /Downloads folder
+# does not contain any .csv files.
+
+def ScrapeRange(keyword, startmonth, startyear, endmonth, endyear):
+
+    for i in range(startmonth,13,2):
+        ScrapeTwoMonths(keyword,startyear,i)
+        time.sleep(7)
+    for y in range(startyear + 1, endyear):
+        for i in range(1,13,2):
+            ScrapeTwoMonths(keyword,y,i)
+            time.sleep(7)
+    for i in range(1,endmonth,2):
+        ScrapeTwoMonths(keyword,endyear,i)
+        time.sleep(7)
+
+    files = copy.deepcopy(os.listdir(path))
+
+    # Downloads arrive as report.csv, report (1).csv, ...; strip the
+    # parentheses so they can be matched and moved below.
+    for i in range(0,len(files)):
+        if files[i].lower().endswith('.csv'):
+            try:
+                if files[i][-5] == ")":
+                    oldname = path+'/'+files[i]
+                    newname = path+'/report'+files[i][-6]+'.csv'
+                    os.rename(oldname,newname)
+            except OSError:
+                pass
+
+    quarterly_files = [fn for fn in os.listdir(path) if fn.lower().startswith('report')]
+
+    for file in quarterly_files:
+        shutil.move(path+"/"+file,path+'/'+scrapings_dir)
+
+    full_path = path+'/'+scrapings_dir
+    newfiles = copy.deepcopy(os.listdir(full_path))
+
+    # Rename each report after its modification time so the files can be
+    # told apart; a very small file usually means Google returned weekly
+    # rather than daily data for that window.
+    for i in range(0,len(newfiles)):
+        oldname = full_path+'/'+newfiles[i]
+        if os.path.getsize(oldname) < 800:
+            print 'File '+oldname+' is unusually small...'
+        newname = full_path+'/'+str(os.path.getmtime(full_path+'/'+newfiles[i]))[:-2]+".csv"
+        os.rename(oldname, newname)
+
+
+### move files from Downloads folder to /gt_keywords
+### rename files (they can be distinguished by date modified)
+
+### if a quarter only returns weekly data, pad out the weeks by steps:
+
+    ## check weekly or not by filesize
+    ## for weekly files:
+    ## replace week ranges by week start dates
+    ## insert six rows between each row, pad with intervening dates and copy values down
+    ## overwrite files
+
+### copy all these into a single daily list for the full date range
+
+# download weekly data for full period
+
+def ScrapeRangeWeekly(keyword, startmonth, startyear, endmonth, endyear):
+
+    months = 11-startmonth + (endyear - startyear - 1)*12 + endmonth
+    print 'Scraping weekly data from '+month_list[startmonth-1]+' '+str(startyear)+' to '+month_list[endmonth-1]+' '+str(endyear)
+
+    URL_start = "http://www.google.com/trends/trendsReport?&q="
+    URL_end = "&cmpt=q&content=1&export=1"
+
+    queries = keyword[0]
+    if len(keyword) > 1:
+        queries_list = []
+        for i in range(0,len(keyword)):
+            queries_list.append(keyword[i])
+        queries = '%20'.join(queries_list)
+
+    date = '&date='+str(startmonth)+'%2F'+str(startyear)+'%20'+str(months)+'m'
+
+    URL = URL_start+queries+date+URL_end
+
+    webbrowser.open(URL)
+
+    time.sleep(7)
+
+    oldname = path+'/'+'report.csv'
+    newname = path+'/'+'weekly_data.csv'
+    os.rename(oldname,newname)
+
+    shutil.move(newname,path+'/'+scrapings_dir)
+
+# CreateDailyFrame strips each bimonthly report down to its date,volume rows
+# and concatenates the results into a single frame.
+
+def CreateDailyFrame():
+
+    files = copy.deepcopy(os.listdir(path+'/'+scrapings_dir))[:-1]
+
+    date_pattern = re.compile('\d\d\d\d-\d\d-\d\d')
+    for i in range(0,len(files)):
+        if files[i].lower().endswith('.csv'):
+            oldname = path+'/'+scrapings_dir+'/'+files[i]
+            newname = path+'/'+scrapings_dir+'/'+'bimonthly'+str(i)+'.csv'
+            temp_file = csv.reader(open(oldname,'rU'))
+            with open(newname,'wb') as write_to:
+                write_data = csv.writer(write_to, delimiter=',')
+                for row in temp_file:
+                    if len(row)==2:
+                        if re.search(date_pattern,row[0]) is not None:
+                            write_data.writerows([row])
+            os.remove(oldname)
+
+    files = [fn for fn in copy.deepcopy(os.listdir(path+'/'+scrapings_dir))[:-1] if fn.lower().startswith('bimonthly')]
+
+    frames_list = []
+
+    for file in files:
+        df = pd.read_csv(path+'/'+scrapings_dir+'/'+file,index_col=None,header=None)
+        frames_list.append(df)
+
+    frame = pd.concat(frames_list,ignore_index=True)
+    return frame
+
+# CreateWeeklyFrame does the same for the single weekly report, whose first
+# column holds week ranges rather than single dates.
+
+def CreateWeeklyFrame():
+
+    date_pattern = re.compile('\d\d\d\d-\d\d-\d\d\s-\s\d\d\d\d-\d\d-\d\d')
+
+    oldname = path+'/'+scrapings_dir+'/'+'weekly_data.csv'
+    newname = path+'/'+scrapings_dir+'/'+'weekly.csv'
+    temp_file = csv.reader(open(oldname,'rU'))
+    with open(newname,'wb') as write_to:
+        write_data = csv.writer(write_to, delimiter=',')
+        for row in temp_file:
+            if len(row) == 2:
+                if re.search(date_pattern,row[0]) is not None:
+                    write_data.writerows([row])
+    os.remove(oldname)
+
+    frame = pd.read_csv(newname,index_col=None,header=None)
+    return frame
+
+# StitchFrames rescales the daily values in each week against that week's
+# weekly volume, normalises the combined series to a 0-100 index and writes
+# it out as final_output.csv.
+
+def StitchFrames():
+
+    daily_frame = CreateDailyFrame()
+    interim_weekly_frame = CreateWeeklyFrame()
+
+    daily_frame.columns = ['Date', 'Daily_Volume']
+    pd.to_datetime(daily_frame['Date'])
+
+    interim_weekly_frame.columns = ['Date_Range', 'Weekly_Volume']
+    date_pattern = re.compile('\d\d\d\d-\d\d-\d\d')
+
+    startdates = []
+    enddates = []
+
+    for i in range(0,len(interim_weekly_frame['Date_Range'])):
+        startdates.append(re.findall(date_pattern,interim_weekly_frame['Date_Range'][i])[0])
+        enddates.append(re.findall(date_pattern,interim_weekly_frame['Date_Range'][i])[1])
+
+    weekly_frame = pd.DataFrame(data=[startdates,enddates,interim_weekly_frame['Weekly_Volume'].tolist()]).transpose()
+    weekly_frame.columns = ['Start_Date', 'End_Date', 'Weekly_Volume']
+    pd.to_datetime(weekly_frame['Start_Date'])
+    pd.to_datetime(weekly_frame['End_Date'])
+
+    # One seven-day bin of dates per scraped week.
+    bins = []
+
+    for i in range(0,len(weekly_frame)):
+        bins.append(pd.date_range(weekly_frame['Start_Date'][i],periods=7,freq='d'))
+
+    weekly_frame = weekly_frame.set_index('Start_Date')
+
+    daily_frame = daily_frame.set_index('Date')
+
+    final_data = {}
+
+    # Scale each day by its week's volume relative to the first day of the week.
+    for i in range(0,len(bins)):
+        for j in range(0,len(bins[i])):
+            final_data[bins[i][j]] = weekly_frame['Weekly_Volume'][str(bins[i][0].date())]*daily_frame['Daily_Volume'][str(bins[i][j].date())]/daily_frame['Daily_Volume'][str(bins[i][0].date())]
+
+    final_data_frame = DataFrame.from_dict(final_data,orient='index').sort()
+    final_data_frame[0] = np.round(final_data_frame[0]/final_data_frame[0].max()*100)
+
+    final_data_frame.columns=['Volume']
+    final_data_frame.index.names = ['Date']
+
+    final_name = path+'/'+scrapings_dir+'/'+'final_output.csv'
+
+    final_data_frame.to_csv(final_name, sep=',')
+
+## run as python gt_scraper.py startmonth startyear endmonth endyear keyword_list
+
+if __name__ == '__main__':
+
+    print sys.argv[0]
+
+    startmonth = sys.argv[1]
+    startyear = sys.argv[2]
+    endmonth = sys.argv[3]
+    endyear = sys.argv[4]
+    keywords = sys.argv[5:]
+
+    path = '/Users/clintonboys/Downloads'
+
+    scrapings_dir = 'gt_{0}'.format(keywords[0])
+    if not os.path.exists(path+"/"+scrapings_dir):
+        os.makedirs(path+"/"+scrapings_dir)
+
+    ScrapeRange(keywords, int(startmonth), int(startyear), int(endmonth), int(endyear))
+    ScrapeRangeWeekly(keywords, int(startmonth), int(startyear), int(endmonth), int(endyear))
+
+    StitchFrames()
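
gt_scraper.py is driven entirely from the command line. A run would look like the following, where the keyword and date range are invented for illustration (the Downloads path is hard-coded in the script):

    python gt_scraper.py 1 2012 6 2014 pizza

This scrapes the bimonthly and weekly reports for "pizza" from January 2012 to June 2014 and leaves the stitched daily series in /Users/clintonboys/Downloads/gt_pizza/final_output.csv.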
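
As a standalone illustration (not part of the patch) of the arithmetic StitchFrames performs: each day's value from the bimonthly reports is scaled so that the first day of its week carries that week's value from the weekly report, and the combined series is then renormalised to peak at 100. A minimal sketch with invented numbers:

# Standalone sketch of the rescaling in StitchFrames; all numbers are invented.
weekly_volume = 80                      # weekly report value for the week starting 2014-01-05
daily_volume = {'2014-01-05': 40,       # "daily" values for that week from a bimonthly report
                '2014-01-06': 60,
                '2014-01-07': 20}

anchor = daily_volume['2014-01-05']     # first day of the week, like bins[i][0] in the script
stitched = {}
for day, vol in daily_volume.items():
    # weekly value scaled by the day's volume relative to the week's first day
    stitched[day] = weekly_volume * vol / float(anchor)

peak = max(stitched.values())
normalised = dict((day, round(vol / peak * 100)) for day, vol in stitched.items())
print(normalised)                       # roughly {'2014-01-05': 67.0, '2014-01-06': 100.0, '2014-01-07': 33.0}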