Friday, November 11, 2016

2 - Scale Google Trends Data


input.txt
BOMBER JACKET
DENIM JACKET
Band jacket
TRENCH COAT
DENIM VEST
utility jacket
military jacket
moto jacket
asymmetrical jacket
Robe coat

#################### Individual Keyword Data - individual.py #####################
from pytrends.request import TrendReq
import json
import time
import random
import numpy as np

fhand = open('input.txt')

idx = 0
for val in fhand:
    # Fresh session per request; credentials redacted by the author.
    pytrend = TrendReq("***@gmail.com", "***", custom_useragent='My Pytrends Script')
    time.sleep(random.randint(5, 10))  # pause between requests to avoid throttling
    tmp = val.rstrip()
    trend_payload = {'q': [tmp], 'date': 'today 36-m'}
    df = pytrend.trend(trend_payload, return_type='dataframe')
    dates = np.asarray(df.index.strftime("%Y-%m-%d"))
    df.index = dates
    df.to_csv(str(idx) + ".csv")
    idx = idx + 1
    print(idx, tmp)
print("Done")

####################### Step 1 Fetch Data - step1.py ############################

from pytrends.request import TrendReq
from itertools import permutations
import numpy as np
import pandas as pd
import json
import time
import pickle
import datetime
import random

keywords = []

fhand = open('input.txt')
for val in fhand:
    # Lowercase so pairs sort the same way here and in step2.py.
    keywords.append(val.rstrip().lower())

# permutations yields ordered pairs; sorting each tuple and deduplicating
# leaves every unordered pair exactly once (45 pairs for 10 keywords).
all_pairs = list(permutations(keywords, 2))
temp_pairs = [tuple(sorted(pair)) for pair in all_pairs]
all_pairs = list(set(temp_pairs))

average_ratios = {}
all_dfs = {}

#i = 0
i = 24  # manual restart point after an earlier run was throttled
for pair in all_pairs[i:]:
    # Fresh session per request; credentials redacted by the author.
    pytrend = TrendReq("***@gmail.com", "***", custom_useragent='My Pytrends Script')

    time.sleep(random.randint(5, 10))
    keyword = ','.join(pair)
    print("index: ", i, keyword)
    i += 1
    df = pytrend.trend({'q': keyword, 'date': 'today 7-m'}, return_type='dataframe')

    dates = np.asarray(df.index.strftime("%Y-%m-%d"))
    df.index = dates
    df.to_csv("pair " + str(i) + ".csv")

#################### Step 2 Brute Force - step2.py ####################################

from pytrends.request import TrendReq
from itertools import permutations
import numpy as np
import pandas as pd
import json
import time
import pickle
import datetime
import random

keywords = []
all_dfs = {}
average_ratios = {}

fhand = open('input.txt')

for val in fhand:
    tmp = val.rstrip().lower()
    keywords.append(tmp)

all_pairs = list(permutations(keywords, 2))
temp_pairs = [tuple(sorted(pair)) for pair in all_pairs]
all_pairs = list(set(temp_pairs))

for i in range(len(all_pairs)):
    #df = pickle.load(open("data_pytrends_jeans_new/temp" + str(i) + ".p", "rb"))
    # 7 months of weekly data is about 30 rows, hence nrows=30.
    df = pd.read_csv("pair " + str(1 + i) + ".csv", index_col=[0], nrows=30)
    all_dfs[tuple(df.columns.values.tolist())] = df

## Compute Average Ratios and Rank Keywords according to the number
## of their ratios less than 1 ######################################
# average_ratios is stored in both directions: e.g. if k1 / k2 = 0.5, then
# average_ratios[k1][k2] = 0.5 and average_ratios[k2][k1] = 2.
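# Worked example with made-up numbers: if 'denim vest' is exactly half of
# 'denim jacket' in every week of their shared pair frame, then
#   average_ratios['denim vest']['denim jacket'] = 0.5
#   average_ratios['denim jacket']['denim vest'] = 2.0
# Any ratio below 1 marks the keyword that lost to its partner in that pair.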

for pair in all_pairs:
    # all_dfs is keyed by the frame's column tuple, which matches the sorted
    # pair because step1.py queried each pair in sorted order.
    df = all_dfs[pair]
    first = pair[0]
    second = pair[1]

    # Row-wise ratios of the two columns, averaged over the ~30 weeks.
    average_ratio_0 = np.mean(df.apply(lambda x: np.divide(x.iloc[0] * 1.0, x.iloc[1]), axis=1).values)
    average_ratio_1 = np.mean(df.apply(lambda x: np.divide(x.iloc[1] * 1.0, x.iloc[0]), axis=1).values)

    if first not in average_ratios:
        average_ratios[first] = dict()

    if second not in average_ratios:
        average_ratios[second] = dict()

    average_ratios[first][second] = average_ratio_0
    average_ratios[second][first] = average_ratio_1

# Count, for each keyword, how many of its average ratios are below 1.
times_keywords_ratio_less_than_one = []
for keyword in keywords:
    all_ratios = list(average_ratios[keyword].values())
    times_keywords_ratio_less_than_one.append([keyword, sum(1 for r in all_ratios if r < 1)])

times_keywords_ratio_less_than_one = sorted(times_keywords_ratio_less_than_one, key=lambda x: x[1], reverse=True)
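# At this point the ranking looks like (hypothetical counts for the ten
# inputs): [['robe coat', 9], ['band jacket', 8], ..., ['bomber jacket', 0]].
# A keyword that is smaller than all of its partners sorts first, so the list
# runs from least to most searched; the renormalization below chains pairs
# starting from that low end.
print(times_keywords_ratio_less_than_one)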


###### Renormalization ############################################################
all_pairs = list(all_dfs.keys())
new_keywords = [k[0] for k in times_keywords_ratio_less_than_one]
j = 2

result = pd.DataFrame(None)
dates = list(all_dfs.values())[0].index

while j < len(keywords):
    lowest_keyword = new_keywords[j - 2]
    second_lowest_keyword = new_keywords[j - 1]
    third_lowest_keyword = new_keywords[j]

    print(lowest_keyword, second_lowest_keyword, third_lowest_keyword)

    pair_0 = tuple(sorted([lowest_keyword, second_lowest_keyword]))
    pair_1 = tuple(sorted([second_lowest_keyword, third_lowest_keyword]))
    pair_2 = tuple(sorted([lowest_keyword, third_lowest_keyword]))

    temp_df_0 = all_dfs[pair_0]
    temp_df_1 = all_dfs[pair_1]
    temp_df_2 = all_dfs[pair_2]  # loaded but not used in the chaining below

    if j == 2:
        result = pd.concat([result, temp_df_0], axis=1)
        result.index = dates

    # The overlap keyword appears in both pair frames at two different scales.
    arr_0 = temp_df_0[second_lowest_keyword].values
    arr_1 = temp_df_1[second_lowest_keyword].values

    ratios = arr_0 * 1.0 / arr_1

    # Rescale the lower-ranking keywords already in result by the ratio of the
    # overlap keyword's two scalings, drop the duplicate overlap column, then
    # concatenate the frame containing the higher-ranking keyword.
    result = result.apply(lambda x: x / ratios, axis=0)
    result = result.drop(second_lowest_keyword, axis=1)
    result = pd.concat([result, temp_df_1], axis=1)
    j += 1

result.to_csv("keywords_renormalization.csv")
print('Done')
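The chaining step is easier to see with concrete numbers. A standalone sketch with made-up values (keyword names reused from input.txt purely for illustration):

import pandas as pd

# Two hypothetical pair frames sharing 'denim jacket'. Google Trends scales
# each request separately, so the shared column sits at two different levels.
df_low = pd.DataFrame({'robe coat': [10.0, 12.0], 'denim jacket': [50.0, 60.0]})
df_high = pd.DataFrame({'denim jacket': [25.0, 30.0], 'bomber jacket': [100.0, 90.0]})

# How much bigger the shared keyword looks in the low frame than the high one.
ratios = df_low['denim jacket'].values / df_high['denim jacket'].values  # [2., 2.]

# Shrink the low frame onto the high frame's scale, drop the duplicate
# column, and concatenate: all three keywords now share one scale.
rescaled = df_low.apply(lambda col: col / ratios, axis=0)
result = pd.concat([rescaled.drop('denim jacket', axis=1), df_high], axis=1)
print(result)

After the division, 'denim jacket' reads [25, 30] in both frames, so dropping one copy and concatenating leaves all three series on the scale of the most popular pair.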
