-- IPython and Jupyter notebook
Every cell has an In [] label. If there's nothing between the braces, it means that cell has never been executed. If there is a number, it means that the cell has been executed, and the number represents the order in which the cell was executed. Finally, a * means that the cell is currently being executed.
$ pip install jupyter pandas matplotlib fake-factory delorean xlwt
$ jupyter notebook
-- Dealing with data
Setting up the notebook
#1
import json
import calendar
import random
from datetime import date, timedelta
import faker
import numpy as np
from pandas import DataFrame
from delorean
import parse
import pandas as pd
# make the graphs nicer
pd.set_option('display.mpl_style', 'default')
Preparing the data
#2
fake = faker.Faker()
#3
usernames = set()
usernames_no = 1000
# populate the set with 1000 unique usernames
while len(usernames) < usernames_no:
usernames.add(fake.user_name())
#4
def get_random_name_and_gender():
skew = .6 # 60% of users will be female
male = random.random() > skew
if male: return fake.name_male(), 'M'
else: return fake.name_female(), 'F'
def get_users(usernames):
users = []
for username in usernames:
name, gender = get_random_name_and_gender()
user = {
'username': username,
'name': name,
'gender': gender,
'email': fake.email(),
'age': fake.random_int(min=18, max=90),
'address': fake.address(),
}
users.append(json.dumps(user))
return users
users = get_users(usernames)
users[:3]
Out #4
['{"gender": "F", "age": 48, "email": "jovani.dickinson@gmail.com", "address": "2006 Sawayn Trail Apt. 207\\nHyattview, MO 27278", "username": "darcy00", "name": "Virgia Hilpert"}',
'{"gender": "F", "age": 58, "email": "veum.javen@hotmail.com", "address": "5176 Andres Plains Apt. 040\\nLakinside, GA 92446", "username": "renner.virgie", "name": "Miss Clarabelle Kertzmann MD"}',
'{"gender": "M", "age": 33, "email": "turner.felton@rippin.com", "address": "1218 Jacobson Fort\\nNorth Doctor, OK 04469", "username": "hettinger.alphonsus", "name": "Ludwig Prosacco"}']
#5
# campaign name format:
# InternalType_StartDate_EndDate_TargetAge_TargetGender_Currency
def get_type():
# just some gibberish internal codes
types = ['AKX', 'BYU', 'GRZ', 'KTR']
return random.choice(types)
def get_start_end_dates():
duration = random.randint(1, 2 * 365)
offset = random.randint(-365, 365)
start = date.today() - timedelta(days=offset)
end = start + timedelta(days=duration)
def _format_date(date_):
return date_.strftime("%Y%m%d")
return _format_date(start), _format_date(end)
def get_age():
age = random.randint(20, 45)
age -= age % 5
diff = random.randint(5, 25)
diff -= diff % 5
return '{}-{}'.format(age, age + diff)
def get_gender():
return random.choice(('M', 'F', 'B'))
def get_currency():
return random.choice(('GBP', 'EUR', 'USD'))
def get_campaign_name():
separator = '_'
type_ = get_type()
start_end = separator.join(get_start_end_dates())
age = get_age()
gender = get_gender()
currency = get_currency()
return separator.join(
(type_, start_end, age, gender, currency))
#6
def get_campaign_data():
name = get_campaign_name()
budget = random.randint(10**3, 10**6)
spent = random.randint(10**2, budget)
clicks = int(random.triangular(10**2, 10**5, 0.2 * 10**5))
impressions = int(random.gauss(0.5 * 10**6, 2))
return {
'cmp_name': name,
'cmp_bgt': budget,
'cmp_spent': spent,
'cmp_clicks': clicks,
'cmp_impr': impressions
}
#7
def get_data(users):
data = []
for user in users:
campaigns = [get_campaign_data()
for _ in range(random.randint(2, 8))]
data.append({'user': user, 'campaigns': campaigns})
return data
Cleaning the data
#8
rough_data = get_data(users)
rough_data[:2] # let's take a peek
[{'campaigns': [{'cmp_bgt': 130532,
'cmp_clicks': 25576,
'cmp_impr': 500001,
'cmp_name': 'AKX_20150826_20170305_35-50_B_EUR',
'cmp_spent': 57574},
... omit ...
{'cmp_bgt': 884396,
'cmp_clicks': 10955,
'cmp_impr': 499999,
'cmp_name': 'KTR_20151227_20151231_45-55_B_GBP',
'cmp_spent': 318887}],
'user': '{"age": 44, "username": "jacob43",
"name": "Holland Strosin",
"email": "humberto.leuschke@brakus.com",
"address": "1038 Runolfsdottir Parks\\nElmapo...",
"gender": "M"}'}]
#9
data = []
for datum in rough_data:
for campaign in datum['campaigns']:
campaign.update({'user': datum['user']})
data.append(campaign)data[:2] # let's take another peek
[{'cmp_bgt': 130532,
'cmp_clicks': 25576,
'cmp_impr': 500001,
'cmp_name': 'AKX_20150826_20170305_35-50_B_EUR',
'cmp_spent': 57574,
'user': '{"age": 44, "username": "jacob43",
"name": "Holland Strosin",
"email": "humberto.leuschke@brakus.com",
"address": "1038 Runolfsdottir Parks\\nElmaport...",
"gender": "M"}'}]
Creating the DataFrame
#10
df = DataFrame(data)df.head()
#11
df.count()
#12
df.describe()
#13
df.sort_index(by=['cmp_bgt'], ascending=False).head(3)
#14
df.sort_index(by=['cmp_bgt'], ascending=False).tail(3)
Unpacking the campaign name
#15
def unpack_campaign_name(name):
# very optimistic method, assumes data in campaign name
# is always in good state
type_, start, end, age, gender, currency = name.split('_')
start = parse(start).date
end = parse(end).date
return type_, start, end, age, gender, currency
campaign_data = df['cmp_name'].apply(unpack_campaign_name)
campaign_cols = [ 'Type', 'Start', 'End', 'Age', 'Gender', 'Currency']
campaign_df = DataFrame( campaign_data.tolist(), columns=campaign_cols, index=df.index)
campaign_df.head(3)
$16
df = df.join(campaign_df)
#17
df[['cmp_name'] + campaign_cols].head(3)
Unpacking the user data
#18
def unpack_user_json(user):
# very optimistic as well, expects user objects
# to have all attributes
user = json.loads(user.strip())
return [
user['username'],
user['email'],
user['name'],
user['gender'],
user['age'],
user['address'],
]
user_data = df['user'].apply(unpack_user_json)
user_cols = [ 'username', 'email', 'name', 'gender', 'age', 'address']
user_df = DataFrame(
user_data.tolist(),
columns=user_cols,
index=df.index)
df = df.join(user_df)
df[['user'] + user_cols].head(2)
#21
better_columns = [
'Budget', 'Clicks', 'Impressions',
'cmp_name', 'Spent', 'user',
'Type', 'Start', 'End', 'Target Age',
'Target Gender', 'Currency',
'Username', 'Email', 'Name',
'Gender', 'Age', 'Address',
]
df.columns = better_columns
#22
def calculate_extra_columns(df):
# Click Through Rate
df['CTR'] = df['Clicks'] / df['Impressions']
# Cost Per Click
df['CPC'] = df['Spent'] / df['Clicks']
# Cost Per Impression
df['CPI'] = df['Spent'] / df['Impressions']
calculate_extra_columns(df)
#23
df[['Spent', 'Clicks', 'Impressions', 'CTR', 'CPC', 'CPI']].head(3)
#24
clicks = df['Clicks'][0]
impressions = df['Impressions'][0]
spent = df['Spent'][0]
CTR = df['CTR'][0]
CPC = df['CPC'][0]
CPI = df['CPI'][0]
print('CTR:', CTR, clicks / impressions)
print('CPC:', CPC, spent / clicks)
print('CPI:', CPI, spent / impressions)
#25
def get_day_of_the_week(day):
number_to_day = dict(enumerate(calendar.day_name, 1))
return number_to_day[day.isoweekday()]
def get_duration(row):
return (row['End'] - row['Start']).days
df['Day of Week'] = df['Start'].apply(get_day_of_the_week)
df['Duration'] = df.apply(get_duration, axis=1)
#26
df[['Start', 'End', 'Duration', 'Day of Week']].head(3)
Cleaning everything up
#27
final_columns = [ 'Type', 'Start', 'End', 'Duration', 'Day of Week', 'Budget', 'Currency', 'Clicks', 'Impressions', 'Spent', 'CTR', 'CPC', 'CPI', 'Target Age', 'Target Gender', 'Username', 'Email', 'Name', 'Gender', 'Age']
df = df[final_columns]
Saving the DataFrame to a file
#28
df.to_csv('df.csv')
#29
df.to_json('df.json')
#30
df.to_excel('df.xls')
Visualizing the results
# make the graphs nicer
pd.set_option('display.mpl_style', 'default')
#31
%matplotlib inline
#32
import pylab
pylab.rcParams.update({'font.family' : 'serif'})
#33
df.describe()
#34
df[['Budget', 'Spent', 'Clicks', 'Impressions']].hist( bins=16, figsize=(16, 6));
#35
df[['CTR', 'CPC', 'CPI']].hist( bins=20, figsize=(16, 6));
#36
mask = (df.Spent > 0.75 * df.Budget)
df[mask][['Budget', 'Spent', 'Clicks', 'Impressions']].hist( bins=15, figsize=(16, 6), color='g');
#37
df_weekday = df.groupby(['Day of Week']).sum()
df_weekday[['Impressions', 'Spent', 'Clicks']].plot( figsize=(16, 6), subplots=True);
#38
agg_config = {
'Impressions': {
'Mean Impr': 'mean',
'Std Impr': 'std', },
'Spent': ['mean', 'std'],}
df.groupby(['Target Gender', 'Target Age']).agg(agg_config)
#39
pivot = df.pivot_table(
values=['Impressions', 'Clicks', 'Spent'],
index=['Target Age'],
columns=['Target Gender'],
aggfunc=np.sum)
pivot
-- Where do we go from here?
-- Summary
No comments:
Post a Comment