Wednesday, March 30, 2016

Learning Python 9 - Data Science

-- IPython and Jupyter notebook

Every cell has an In [] label. If there's nothing between the braces, it means that cell has never been executed. If there is a number, it means that the cell has been executed, and the number represents the order in which the cell was executed. Finally, a * means that the cell is currently being executed.

$ pip install jupyter pandas matplotlib fake-factory delorean xlwt

$ jupyter notebook

-- Dealing with data

Setting up the notebook

#1

import json
import calendar
import random
from datetime import date, timedelta

import faker
import numpy as np
from pandas import DataFrame
from delorean 
import parse
import pandas as pd
# make the graphs nicer
pd.set_option('display.mpl_style', 'default')


Preparing the data

#2

fake = faker.Faker()

#3

usernames = set()
usernames_no = 1000
# populate the set with 1000 unique usernames
while len(usernames) < usernames_no:    
usernames.add(fake.user_name())

#4

def get_random_name_and_gender():    
skew = .6  # 60% of users will be female    
male = random.random() > skew    
if male:        return fake.name_male(), 'M'    
else:        return fake.name_female(), 'F'

def get_users(usernames):    
users = []    
for username in usernames:        
name, gender = get_random_name_and_gender()        
user = {            
'username': username,            
'name': name,            
'gender': gender,            
'email': fake.email(),            
'age': fake.random_int(min=18, max=90),            
'address': fake.address(),        
}        
users.append(json.dumps(user))    
return users
users = get_users(usernames)
users[:3]

Out #4

['{"gender": "F", "age": 48, "email": "jovani.dickinson@gmail.com", "address": "2006 Sawayn Trail Apt. 207\\nHyattview, MO 27278", "username": "darcy00", "name": "Virgia Hilpert"}',
 '{"gender": "F", "age": 58, "email": "veum.javen@hotmail.com", "address": "5176 Andres Plains Apt. 040\\nLakinside, GA 92446", "username": "renner.virgie", "name": "Miss Clarabelle Kertzmann MD"}',
 '{"gender": "M", "age": 33, "email": "turner.felton@rippin.com", "address": "1218 Jacobson Fort\\nNorth Doctor, OK 04469", "username": "hettinger.alphonsus", "name": "Ludwig Prosacco"}']

#5 

# campaign name format:
# InternalType_StartDate_EndDate_TargetAge_TargetGender_Currency
def get_type():    
# just some gibberish internal codes    
types = ['AKX', 'BYU', 'GRZ', 'KTR']    
return random.choice(types)

def get_start_end_dates():    
duration = random.randint(1, 2 * 365)    
offset = random.randint(-365, 365)    
start = date.today() - timedelta(days=offset)   
end = start + timedelta(days=duration)        
def _format_date(date_):        
return date_.strftime("%Y%m%d")        
return _format_date(start), _format_date(end)

def get_age():    
age = random.randint(20, 45)    
age -= age % 5    
diff = random.randint(5, 25)    
diff -= diff % 5    
return '{}-{}'.format(age, age + diff)

def get_gender():    
return random.choice(('M', 'F', 'B'))

def get_currency():    
return random.choice(('GBP', 'EUR', 'USD'))

def get_campaign_name():    
separator = '_'    
type_ = get_type()    
start_end = separator.join(get_start_end_dates())    
age = get_age()    
gender = get_gender()    
currency = get_currency()    
return separator.join(        
(type_, start_end, age, gender, currency))

#6

def get_campaign_data():
    name = get_campaign_name()
    budget = random.randint(10**3, 10**6)
    spent = random.randint(10**2, budget)    
    clicks = int(random.triangular(10**2, 10**5, 0.2 * 10**5))    
    impressions = int(random.gauss(0.5 * 10**6, 2))
    return {
        'cmp_name': name,
        'cmp_bgt': budget,
        'cmp_spent': spent,
        'cmp_clicks': clicks,
        'cmp_impr': impressions
    }

#7

def get_data(users):    
data = []    
for user in users:        
campaigns = [get_campaign_data()                     
for _ in range(random.randint(2, 8))]        
data.append({'user': user, 'campaigns': campaigns})    
return data

Cleaning the data

#8

rough_data = get_data(users)
rough_data[:2]  # let's take a peek

[{'campaigns': [{'cmp_bgt': 130532,    
'cmp_clicks': 25576,    
'cmp_impr': 500001,    
'cmp_name': 'AKX_20150826_20170305_35-50_B_EUR',    
'cmp_spent': 57574},   
... omit ...   
{'cmp_bgt': 884396,    
'cmp_clicks': 10955,    
'cmp_impr': 499999,    
'cmp_name': 'KTR_20151227_20151231_45-55_B_GBP',    
'cmp_spent': 318887}],  
'user': '{"age": 44, "username": "jacob43",            
"name": "Holland Strosin",            
"email": "humberto.leuschke@brakus.com",            
"address": "1038 Runolfsdottir Parks\\nElmapo...",            
"gender": "M"}'}]

#9

data = []
for datum in rough_data:    
for campaign in datum['campaigns']:        
campaign.update({'user': datum['user']})        
data.append(campaign)data[:2]  # let's take another peek

[{'cmp_bgt': 130532,
  'cmp_clicks': 25576,
  'cmp_impr': 500001,
  'cmp_name': 'AKX_20150826_20170305_35-50_B_EUR',
  'cmp_spent': 57574,
  'user': '{"age": 44, "username": "jacob43",
            "name": "Holland Strosin",
            "email": "humberto.leuschke@brakus.com",
            "address": "1038 Runolfsdottir Parks\\nElmaport...",
            "gender": "M"}'}]

Creating the DataFrame

#10
df = DataFrame(data)df.head()

#11
df.count()

#12
df.describe()

#13
df.sort_index(by=['cmp_bgt'], ascending=False).head(3)

#14 
df.sort_index(by=['cmp_bgt'], ascending=False).tail(3)

Unpacking the campaign name

#15
def unpack_campaign_name(name):    
# very optimistic method, assumes data in campaign name    
# is always in good state    
type_, start, end, age, gender, currency = name.split('_')    
start = parse(start).date    
end = parse(end).date    
return type_, start, end, age, gender, currency

campaign_data = df['cmp_name'].apply(unpack_campaign_name)
campaign_cols = [    'Type', 'Start', 'End', 'Age', 'Gender', 'Currency']
campaign_df = DataFrame(    campaign_data.tolist(), columns=campaign_cols, index=df.index)
campaign_df.head(3)

$16

df = df.join(campaign_df)

#17

df[['cmp_name'] + campaign_cols].head(3)

Unpacking the user data

#18

def unpack_user_json(user):    
# very optimistic as well, expects user objects   
# to have all attributes    
user = json.loads(user.strip())    
return [        
user['username'],        
user['email'],        
user['name'],        
user['gender'],        
user['age'],        
user['address'],    
]

user_data = df['user'].apply(unpack_user_json)
user_cols = [    'username', 'email', 'name', 'gender', 'age', 'address']
user_df = DataFrame(    
user_data.tolist(), 
columns=user_cols, 
index=df.index)

df = df.join(user_df)
df[['user'] + user_cols].head(2)

#21

better_columns = [    
'Budget', 'Clicks', 'Impressions',    
'cmp_name', 'Spent', 'user',    
'Type', 'Start', 'End',    'Target Age', 
'Target Gender', 'Currency',    
'Username', 'Email', 'Name',    
'Gender', 'Age', 'Address',
]

df.columns = better_columns

#22

def calculate_extra_columns(df):    
# Click Through Rate    
df['CTR'] = df['Clicks'] / df['Impressions']    
# Cost Per Click    
df['CPC'] = df['Spent'] / df['Clicks']    
# Cost Per Impression    
df['CPI'] = df['Spent'] / df['Impressions']
calculate_extra_columns(df)

#23

df[['Spent', 'Clicks', 'Impressions',    'CTR', 'CPC', 'CPI']].head(3)

#24

clicks = df['Clicks'][0]
impressions = df['Impressions'][0]
spent = df['Spent'][0]
CTR = df['CTR'][0]
CPC = df['CPC'][0]
CPI = df['CPI'][0]
print('CTR:', CTR, clicks / impressions)
print('CPC:', CPC, spent / clicks)
print('CPI:', CPI, spent / impressions)

#25

def get_day_of_the_week(day):    
number_to_day = dict(enumerate(calendar.day_name, 1))    
return number_to_day[day.isoweekday()]

def get_duration(row):    
return (row['End'] - row['Start']).days

df['Day of Week'] = df['Start'].apply(get_day_of_the_week)
df['Duration'] = df.apply(get_duration, axis=1)

#26

df[['Start', 'End', 'Duration', 'Day of Week']].head(3)

Cleaning everything up

#27
final_columns = [    'Type', 'Start', 'End', 'Duration', 'Day of Week', 'Budget',    'Currency', 'Clicks', 'Impressions', 'Spent', 'CTR', 'CPC',    'CPI', 'Target Age', 'Target Gender', 'Username', 'Email',    'Name', 'Gender', 'Age']

df = df[final_columns]

Saving the DataFrame to a file

#28

df.to_csv('df.csv')

#29

df.to_json('df.json')

#30

df.to_excel('df.xls')

Visualizing the results

# make the graphs nicer
pd.set_option('display.mpl_style', 'default')

#31

%matplotlib inline

#32

import pylab

pylab.rcParams.update({'font.family' : 'serif'})

#33

df.describe()

#34

df[['Budget', 'Spent', 'Clicks', 'Impressions']].hist(    bins=16, figsize=(16, 6));

#35

df[['CTR', 'CPC', 'CPI']].hist(    bins=20, figsize=(16, 6));

#36

mask = (df.Spent > 0.75 * df.Budget)
df[mask][['Budget', 'Spent', 'Clicks', 'Impressions']].hist(    bins=15, figsize=(16, 6), color='g');

#37

df_weekday = df.groupby(['Day of Week']).sum()
df_weekday[['Impressions', 'Spent', 'Clicks']].plot(    figsize=(16, 6), subplots=True);

#38

agg_config = {    
'Impressions': {        
'Mean Impr': 'mean',        
'Std Impr': 'std',    },    

'Spent': ['mean', 'std'],}
df.groupby(['Target Gender', 'Target Age']).agg(agg_config)

#39

pivot = df.pivot_table(    
values=['Impressions', 'Clicks', 'Spent'],    
index=['Target Age'],    
columns=['Target Gender'],    
aggfunc=np.sum)
pivot

-- Where do we go from here?

-- Summary


No comments:

Post a Comment

Blog Archive