Monday, February 29, 2016

Python Data Analysis 5 - pandas: Reading and Writing Data

I/O API Tools

-- CSV and textual Files

read_csv
read_table
to_csv

-- Reading Data in CSV or Text Files
csvframe = read_csv('myCSV_01.csv')
csvframe
read_table('ch05_01.csv',sep=',')
read_csv('ch05_02.csv')
read_csv('ch05_02.csv', header=None)
read_csv('ch05_02.csv', names=['white','red','blue','green','animal'])
read_csv('ch05_03.csv', index_col=['color','status'])

-- Using RegExp for Parsing TXT Files
read_table('ch05_04.txt',sep='\s*')
read_table('ch05_05.txt',sep='\D*',header=None)
read_table('ch05_06.txt',sep=',',skiprows=[0,1,3,6])

-- Reading TXT Files into Parts or Partially
read_csv('ch05_02.csv',skiprows=[2],nrows=3,header=None)
out = Series()
i=0
pieces = read_csv('ch05_01.csv',chunksize=3)
for piece in pieces:
    out.set_value(i,piece['white'].sum())
    i = i + 1

-- Writing Data in CSV
frame2
frame2.to_csv('ch05_07.csv')
frame2.to_csv('ch05_07b.csv', index=False, header=False)

frame3
frame3.to_csv('ch05_09.csv', na_rep='NaN')

-- Reading and Writing HTML Files
read_html
to_html

-- Writing Data in HTML
frame = pd.DataFrame(np.arange(4).reshape(2,2))
print(frame.to_html())
frame = pd.DataFrame( np.random.random((4,4)),
                    index = ['white','black','red','blue'],
                    columns = ['up','down','right','left'])
s = ['<HTML>']
s.append('<HEAD><TITLE>My DataFrame</TITLE></HEAD>')
s.append('<BODY>')
s.append(frame.to_html())
s.append('</BODY></HTML>')
html = ''.join(s)
html_file = open('myFrame.html','w')
html_file.write(html)
html_file.close()

-- Reading Data from an HTML File
web_frames = pd.read_html('myFrame.html')
web_frames[0]
ranking = pd.read_html('http://www.meccanismocomplesso.org/en/meccanismo-complesso-sito-2/classifica-punteggio/')
ranking[0]

-- Reading Data from XML
from lxml import objectify
xml = objectify.parse('books.xml')
xml
root = xml.getroot()
root.Book.Author
root.Book.PublishDate
root.getchildren()
[child.tag for child in root.Book.getchildren()]
[child.text for child in root.Book.getchildren()]

def etree2df(root):
    column_names = []
    for i in range(0, len(root.getchildren()[0].getchildren())):
       column_names.append(root.getchildren()[0].getchildren()[i].tag)
    xmlframe = pd.DataFrame(columns=column_names)
    for j in range(0, len(root.getchildren())):
       obj = root.getchildren()[j].getchildren()
       texts = []
       for k in range(0, len(column_names)):
          texts.append(obj[k].text)
       row = dict(zip(column_names, texts))
       row_s = pd.Series(row)
       row_s.name = j
       xmlframe = xmlframe.append(row_s)
    return xmlframe

etree2df(root)

-- Reading and Writing Data on Microsoft Excel Files
to_excel()
read_excel()
pd.read_excel('data.xls')
pd.read_excel('data.xls','Sheet2')
pd.read_excel('data.xls',1)
frame = pd.DataFrame(np.random.random((4,4)),
                      index = ['exp1','exp2','exp3','exp4'],
                      columns = ['Jan2015','Feb2015','Mar2015','Apr2015'])
frame
frame.to_excel('data2.xlsx')

-- JSON Data

read_json
to_json
frame = pd.DataFrame(np.arange(16).reshape(4,4),
                      index=['white','black','red','blue'],
                      columns=['up','down','right','left'])
frame.to_json('frame.json')
pd.read_json('frame.json')

from pandas.io.json import json_normalize
import json
file = open('books.json','r')
text = file.read()
text = json.loads(text)
json_normalize(text,'books')
json_normalize(text2,'books',['writer','nationality'])

-- The Format HDF5
from pandas.io.pytables import HDFStore
frame = pd.DataFrame(np.arange(16).reshape(4,4),
                      index=['white','black','red','blue'],
                      columns=['up','down','right','left'])
store = HDFStore('mydata.h5')
store['obj1'] = frame
frame2
store['obj2'] = frame2

store
store['obj2']

-- Pickle: Python Object Serialization

-- Serialize a Python Object with cPickle

import cPickle as pickle
data = { 'color': ['white','red'], 'value': [5, 7]}
pickled_data = pickle.dumps(data)
print pickled_data
nframe = pickle.loads(pickled_data)
nframe

-- Pickling with pandas

frame = pd.DataFrame(np.arange(16).reshape(4,4), index = ['up','down','left','right'])
frame.to_pickle('frame.pkl')

pd.read_pickle('frame.pkl')

-- Interacting with Databases
from sqlalchemy import create_engine

engine = create_engine('postgresql://scott:tiger@localhost:5432/mydatabase')
engine = create_engine('mysql+mysqldb://scott:tiger@localhost/foo')
engine = create_engine('oracle://scott:tiger@127.0.0.1:1521/sidname')
engine = create_engine('mssql+pyodbc://mydsn')
engine = create_engine('sqlite:///foo.db')

-- Loading and Writing Data with SQLite3
frame = pd.DataFrame( np.arange(20).reshape(4,5), columns=['white','red','blue','black','green'])
frame
engine = create_engine('sqlite:///foo.db')
frame.to_sql('colors',engine)
pd.read_sql('colors',engine)
import sqlite3
query = """
CREATE TABLE test
(a VARCHAR(20), b VARCHAR(20),
c REAL,        d INTEGER
);"""
con = sqlite3.connect(':memory:')
con.execute(query)
con.commit()

data = [('white','up',1,3),
         ('black','down',2,8),
         ('green','up',4,4),
         ('red','down',5,5)]
stmt = "INSERT INTO test VALUES(?,?,?,?)"
con.executemany(stmt, data)
con.commit()

cursor = con.execute('select * from test')
cursor
rows = cursor.fetchall()
rows

cursor.description
pd.DataFrame(rows, columns=zip(*cursor.description)[0])

-- Loading and Writing Data with PostgreSQL
pd.__version__
engine = create_engine('postgresql://postgres:password@localhost:5432/postgres')
frame = pd.DataFrame(np.random.random((4,4)),
              index=['exp1','exp2','exp3','exp4'],
              columns=['feb','mar','apr','may'])
frame.to_sql('dataframe',engine)
psql -U postgres
pd.read_sql_table('dataframe',engine)
pd.read_sql_query('SELECT index,apr,may FROM DATAFRAME WHERE apr > 0.5',engine)

-- Reading and Writing Data with a NoSQL Database: MongoDB
mongod --dbpath C:\MongoDB_data
import pymongo
from pymongo import MongoClient
client = MongoClient('localhost',27017)
db = client.mydatabase
db
collection = db.mycollection
db['mycollection']
collection
frame = pd.DataFrame( np.arange(20).reshape(4,5),
                       columns=['white','red','blue','black','green'])
frame

import json
record = json.loads(frame.T.to_json()).values()
record
collection.mydocument.insert(record)

cursor = collection['mydocument'].find()
dataframe = (pd.DataFrame(list(cursor)))
del dataframe['_id']
dataframe


Conclusions

Thursday, February 25, 2016

Python Data Analysis 4 - The pandas Library - An Introduction

Pandas: The Python Data Analytics Library

Installation

Installation from Anaconda

Installation on Linux

-- Test Your pandas Installation

nosetests pandas

-- Getting Started with pandas

import pandas as pd
import numpy as np
from pandas import *

-- Introduction to pandas Data Structures

-- The Series

 Declaring a Series

s = pd.Series([12,-4,7,9])
s

s = pd.Series([12,-4,7,9], index=['a','b','c','d'])
s
s.values
s.index

 Selecting the Internal Elements

s[2]
s['b']
s[['b', 'c']]

 Assigning Values to the Elements

s[1] = 0
s
s['b'] = 1
s

 Defining Series from NumPy Arrays and Other Series

arr = np.array([1,2,3,4])
s3 = pd.Series(arr)
s3
s4 = pd.Series(s)
s4
s3
arr[2] = -2
s3

 Filtering Values

s[s > 8]

 Operations and Mathematical Functions

s/2

np.log(s)


 Evaluating Values

serd = pd.Series([1,0,2,1,2,3], index=['white','white','blue','green','green','yellow'])
serd
serd.unique()
serd.value_counts()
serd.isin([0,3])

 NaN Values

s2 = pd.Series([5,-3,np.NaN,14])
s2
s2.isnull()
s2[s2.notnull()]

 Series as Dictionaries

mydict = {'red': 2000, 'blue': 1000, 'yellow': 500, 'orange': 1000}
myseries = pd.Series(mydict)
colors = ['red','yellow','orange','blue','green']
myseries = pd.Series(mydict, index=colors)

 Operations between Series

mydict2 = {'red':400,'yellow':1000,'black':700}
myseries2 = pd.Series(mydict2)
myseries + myseries2

-- The DataFrame

 Defining a DataFrame

data = {'color' : ['blue','green','yellow','red','white'],
                     'object' : ['ball','pen','pencil','paper','mug'],
                     'price' : [1.2,1.0,0.6,0.9,1.7]}
frame = pd.DataFrame(data)
frame
frame2 = pd.DataFrame(data, columns=['object','price'])
frame2
frame2 = pd.DataFrame(data, index=['one','two','three','four','five'])
frame2
frame3 = pd.DataFrame(np.arange(16).reshape((4,4)),
                   index=['red','blue','yellow','white'],
                   columns=['ball','pen','pencil','paper'])
frame3

 Selecting Elements

frame.columns
frame.index
frame.values
frame['price']
frame.ix[2]
frame.ix[[2,4]]
frame[0:1]
frame[1:3]
frame['object'][3]

 Assigning Values

frame.index.name = 'id'; frame.columns.name = 'item'
frame
frame['new'] = 12
frame
frame['new'] = [3.0,1.3,2.2,0.8,1.1]
frame
ser = pd.Series(np.arange(5))
ser
frame['price'][2] = 3.3

 Membership of a Value

frame.isin([1.0,'pen'])
frame[frame.isin([1.0,'pen'])]

 Deleting a Column

del frame['new']
frame

 Filtering

frame[frame < 12]

 DataFrame from Nested dict

nestdict = { 'red': { 2012: 22, 2013: 33 },
                     'white': { 2011: 13, 2012: 22, 2013: 16},
                     'blue': {2011: 17, 2012: 27, 2013: 18}}
frame2 = pd.DataFrame(nestdict)
frame2

 Transposition of a DataFrame

frame2.T

-- The Index Objects

ser = pd.Series([5,0,3,8,4], index=['red','blue','yellow','white','green'])
ser.index

 Methods on Index

ser.idxmin()
ser.idxmax()

 Index with Duplicate Labels

serd = pd.Series(range(6), index=['white','white','blue','green','green','yellow'])
serd

serd['white']
serd.index.is_unique
frame.index.is_unique

-- Other Functionalities on Indexes

ser = pd.Series([2,5,7,4], index=['one','two','three','four'])
ser
ser.reindex(['three','four','five','one'])
ser3 = pd.Series([1,5,6,3],index=[0,3,5,6])
ser3
ser3.reindex(range(6),method='ffill')
ser3.reindex(range(6),method='bfill')
frame.reindex(range(5), method='ffill',columns=['colors','price','new','object'])

-- Dropping

drop()
ser = Series(np.arange(4.), index=['red','blue','yellow','white'])
ser
ser.drop('yellow')
ser.drop(['blue','white'])

frame = pd.DataFrame(np.arange(16).reshape((4,4)),
                   index=['red','blue','yellow','white'],
                   columns=['ball','pen','pencil','paper'])
frame
frame.drop(['blue','yellow'])
frame.drop(['pen','pencil'],axis=1)

-- Arithmetic and Data Alignment

s1 = pd.Series([3,2,5,1],['white','yellow','green','blue'])
s2 = pd.Series([1,4,7,2,1],['white','yellow','black','blue','brown'])

s1+s2

frame1 = pd.DataFrame(np.arange(16).reshape((4,4)),
                   index=['red','blue','yellow','white'],
                   columns=['ball','pen','pencil','paper'])
frame2 = pd.DataFrame(np.arange(12).reshape((4,3)),
                   index=['blue','green','white','yellow'],
                   columns=['mug','pen','ball'])
frame1
frame2
frame1+frame2

-- Operations between Data Structures

Flexible Arithmetic Methods

frame1.add(frame2)

-- Operations between DataFrame and Series

frame = pd.DataFrame(np.arange(16).reshape((4,4)),
                   index=['red','blue','yellow','white'],
                   columns=['ball','pen','pencil','paper'])
frame
ser = pd.Series(np.arange(4), index=['ball','pen','pencil','paper'])
ser
frame - ser
ser['mug'] = 9
ser
frame - ser

-- Function Application and Mapping

Functions by Element
frame = pd.DataFrame(np.arange(16).reshape((4,4)),
                   index=['red','blue','yellow','white'],
                   columns=['ball','pen','pencil','paper'])
frame

np.sqrt(frame)

-- Functions by Row or Column

f = lambda x: x.max() - x.min()
def f(x):
    return x.max() - x.min()
frame.apply(f)
frame.apply(f, axis=1)

def f(x):
     return pd.Series([x.min(), x.max()], index=['min','max'])
frame.apply(f)

-- Statistics Functions

frame.sum()
frame.mean()
frame.describe()

-- Sorting and Ranking
ser = pd.Series([5,0,3,8,4], index=['red','blue','yellow','white','green'])
ser
ser.sort_index(ascending=False)
frame = pd.DataFrame(np.arange(16).reshape((4,4)),
                   index=['red','blue','yellow','white'],
                   columns=['ball','pen','pencil','paper'])
frame
ser.order()
frame.sort_index(by='pen')
frame.sort_index(by=['pen','pencil'])
ser.rank()
ser.rank(method='first')
ser.rank(ascending=False)

-- Correlation and Covariance
seq2 = pd.Series([3,4,3,4,5,4,3,2],['2006','2007','2008','2009','2010','2011','2012','2013'])
seq = pd.Series([1,2,3,4,4,3,2,1],['2006','2007','2008','2009','2010','2011','2012','2013'])
seq.corr(seq2)

frame2 = DataFrame([[1,4,3,6],[4,5,6,1],[3,3,1,5],[4,1,6,4]],
                     index=['red','blue','yellow','white'],
                     columns=['ball','pen','pencil','paper'])
frame2
frame2.corr()
frame2.cov()
frame2.corrwith(ser)
frame2.corrwith(frame)

-- "Not a Number" Data

Assigning a NaN Value

ser = pd.Series([0,1,2,np.NaN,9], index=['red','blue','yellow','white','green'])
ser
ser['white'] = None
ser

Filtering Out NaN Values

ser.dropna()
ser[ser.notnull()]
frame3 = pd.DataFrame([[6,np.nan,6],[np.nan,np.nan,np.nan],[2,np.nan,5]],
                        index = ['blue','green','red'],
                        columns = ['ball','mug','pen'])
frame3
frame3.dropna(how='all')

-- Filling in NaN Occurrences

frame3.fillna(0)

frame3.fillna({'ball':1,'mug':0,'pen':99})

-- Hierarchical Indexing and Leveling
mser = pd.Series(np.random.rand(8),
        index=[['white','white','white','blue','blue','red','red','red'],
        ['up','down','right','up','down','up','down','left']])
mser
mser.index
mser['white']
mser[:,'up']
mser['white','up']
mser.unstack()

frame
frame.stack()

mframe = pd.DataFrame(np.random.randn(16).reshape(4,4),
      index=[['white','white','red','red'], ['up','down','up','down']],
      columns=[['pen','pen','paper','paper'],[1,2,1,2]])
mframe

-- Reordering and Sorting Levels

mframe.columns.names = ['objects','id']
mframe.index.names = ['colors','status']
mframe
mframe.swaplevel('colors', 'status')
mframe.sortlevel('colors')

-- Summary Statistic by Level

mframe.sum(level='colors')
mframe.sum(level='id', axis=1)

Tuesday, February 23, 2016

Python Data Analysis 3 - The NumPy Library

-- The NumPy Installation

sudo apt-get install python-numpy

import numpy as np

-- Ndarray: The Heart of the Library

a = np.array([1, 2, 3])
a
type(a)
a.dtype
a.ndim
a.size
a.shape

b = np.array([[1.3, 2.4],[0.3, 4.1]])
b.dtype
b.ndim
b.size
b.shape
b.itemsize
b.data

-- Create an Array
c = np.array([[1, 2, 3],[4, 5, 6]])
c
d = np.array(((1, 2, 3),(4, 5, 6)))
d
e = np.array([(1, 2, 3), [4, 5, 6], (7, 8, 9)])
e

-- Types of Data
g = np.array([['a', 'b'],['c', 'd']])
g
g.dtype
g.dtype.name


-- The dtype Option
f = np.array([[1, 2, 3],[4, 5, 6]], dtype=complex)
f

-- Intrinsic Creation of an Array
np.zeros((3, 3))
np.ones((3, 3))
np.arange(0, 10)
np.arange(4, 10)
np.arange(0, 12, 3)
np.arange(0, 6, 0.6)
np.arange(0, 12).reshape(3, 4)
np.linspace(0,10,5)
np.random.random(3)
np.random.random((3,3))

-- Basic Operations
-- Arithmetic Operators
a = np.arange(4)
a
a+4
a*2
b = np.arange(4,8)
a+b
a-b
a*b
a * np.sin(b)
a * np.sqrt(b)

A = np.arange(0, 9).reshape(3, 3)
A
B = np.ones((3, 3))
B
A*B

-- The Matrix Product
np.dot(A,B)
A.dot(B)
np.dot(B,A)

-- Increment and Decrement Operators
a = np.arange(4)
a
a += 1
a
a -= 1
a
a += 4
a
a *= 2
a

-- Universal Functions (ufunc)
a = np.arange(1, 5)
a
np.sqrt(a)
np.log(a)
np.sin(a)

-- Aggregate Functions
a = np.array([3.3, 4.5, 1.2, 5.7, 0.3])
a.sum()
a.min()
a.max()
a.mean()
a.std()

-- Indexing, Slicing and Iterating
a = np.arange(10, 16)
a
a[4]
a[-1]
a[-6]
a[[1,3,4]]

A = np.arange(10, 19).reshape((3, 3))
A
A[1,2]

a = np.arange(10, 16)
a
a[1:5]
a[1:5:2]
a[::2]
a[:5:2]
a[:5:]

A = np.arange(10, 19).reshape((3, 3))
A
A[0,:]
A[0:2, 0:2]
A[[0,2], 0:2]

Iterating an Array
for i in a:
    print(i)
for row in A:
    print(row)
for item in A.flat:
    print(item)
np.apply_along_axis(np.mean, axis=0, arr=A)
np.apply_along_axis(np.mean, axis=1, arr=A)

def foo(x):
    return x/2

np.apply_along_axis(foo, axis=1, arr=A)
np.apply_along_axis(foo, axis=0, arr=A)

-- Conditions and Boolean Arrays
A = np.random.random((4, 4))
A
A<0.5
A[A<0.5]

-- Shape Manipulation
a = np.random.random(12)
a
A = a.reshape(3, 4)
A

a.shape = (3, 4)
a
a = a.ravel()
a.shape = (12)
a
A.transpose()

-- Array Manipulation

-- Joining Arrays

A = np.ones((3, 3))
B = np.zeros((3, 3))
np.vstack((A, B))
np.hstack((A, B))
a = np.array([0, 1, 2])
b = np.array([3, 4, 5])
c = np.array([6, 7, 8])
np.column_stack((a, b, c))
np.row_stack((a, b, c))

-- Splitting Arrays
A = np.arange(16).reshape((4, 4))
A
[B,C] = np.hsplit(A, 2)
B
C
[B,C] = np.vsplit(A, 2)
B
C
[A1,A2,A3] = np.split(A,[1,3],axis=1)
A1
A2
A3
[A1,A2,A3] = np.split(A,[1,3],axis=0)
A1
A2
A3

-- General Concepts

Copies or Views of Objects

a = np.array([1, 2, 3, 4])
b = a
b
a[2] = 0
b
c = a[0:2]
c
a[0] = 0
c
a = np.array([1, 2, 3, 4])
c = a.copy()
c
a[0]=0
c

-- Vectorization

a * b
A * B
for (i = 0; i < rows; i++){
  c[i] = a[i]*b[i];
}
for( i=0; i < rows; i++){
   for(j=0; j < columns; j++){
      c[i][j] = a[i][j]*b[i][j];
   }
}

-- Broadcasting
A = np.arange(16).reshape(4, 4)
b = np.arange(4)
A
b
A+b
m = np.arange(6).reshape(3, 1, 2)
n = np.arange(6).reshape(3, 2, 1)
m
n
m+n

-- Structured Arrays

structured = np.array([(1, 'First', 0.5, 1+2j),(2, 'Second', 1.3, 2-2j), (3, 'Third', 0.8, 1+3j)],dtype=('i2, a6, f4, c8'))
structured
structured = np.array([(1, 'First', 0.5, 1+2j),(2, 'Second', 1.3,2-2j), (3, 'Third', 0.8, 1+3j)],dtype=('int16, a6, float32, complex64'))
structured
structured[1]
structured['f1']
structured = np.array([(1,'First',0.5,1+2j),(2,'Second',1.3,2-2j),(3,'Third',0.8,1+3j)],dtype=[('id','i2'),('position','a6'),('value','f4'),('complex','c8')])
structured
structured.dtype.names = ('id','order','value','complex')
structured['order']

Reading and Writing Array Data on Files

Loading and Saving Data in Binary Files

data
np.save('saved_data',data)
loaded_data = np.load('saved_data.npy')
loaded_data

-- Reading Files with Tabular Data
data = np.genfromtxt('data.csv', delimiter=',', names=True)
data

data2 = np.genfromtxt('data2.csv', delimiter=',', names=True)
data2
data2['id']

data2[0]

Monday, February 22, 2016

Python Data Analysis 2 - Introduction to the Python's World

Chap 2 Introduction to the Python's World


The Python interpreter is simply a program that reads and interprets the commands passed to the prompt. You have seen that the interpreter can accept either a single command at a time or entire files of Python code. However, the approach by which it performs this is always the same.

Each time you press the Enter key, the interpreter begins to scan the written code (either a single line or a full file of code) token by token (tokenization). These tokens are fragments of text that the interpreter arranges in a tree structure. The resulting tree is the logical structure of the program, which is then converted to bytecode (.pyc or .pyo). The process chain ends with the bytecode, which is executed by a Python virtual machine (PVM).
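
To make this pipeline concrete, here is a minimal sketch (not from the book) that uses the standard dis module to inspect the bytecode the interpreter generates for a small, made-up function:

import dis

def add(a, b):
    # the interpreter tokenizes this source, builds the parse tree,
    # and compiles it to bytecode before the PVM executes it
    return a + b

# disassemble the bytecode instructions produced for add()
dis.dis(add)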

Using Python

myname = raw_input("What is your name? ")
print "Hi " + myname + ", Iím glad to say: Hello world!"

Writing Python Code

1 + 2
(1.045 * 3)/4
4 ** 2
((4 + 5j) * (2 + 3j))
4 < (2*3)
a = 12 * 3.4
a

import math
math.sin(a)
from math import *
sin(a)
from math import sin

Data Structure

list, set, strings, tuples, dictionary, deque, heap

# dictionary
dict = {'name':'William', 'age':25, 'city':'London'}
dict["name"]

for key, value in dict.items():
 print(key,value)

list = [1,2,3,4]
list
list[2]
list[1:3]
list[-1]
items = [1,2,3,4,5]
for item in items:
 item + 1

# Functional Programming (Only for Python 3.4)

map(function, list)
filter(function, list)
reduce(function, list)
lambda
list comprehension

items = [1,2,3,4,5]
def inc(x): return x+1
list(map(inc,items))
list(map((lambda x: x+1),items))
list(filter((lambda x: x < 4), items))
from functools import reduce
reduce((lambda x,y: x/y), items)

S = [x**2 for x in range(5)] 
S

# Indentation

a = 4
if a > 3:
    if a < 5:
        print("I'm four")
    else:
        print("I'm a little number")

if a > 3:
    if a < 5:
        print("I'm four")
    else:
        print("I'm a big number")

IPython

IPython is a further development of Python that includes a number of tools: the IPython shell, a powerful interactive shell resulting in a greatly enhanced Python terminal; the QtConsole, a hybrid between a shell and a GUI that allows graphics to be displayed inside the console instead of in separate windows; and finally the IPython Notebook, a web interface that allows you to mix text, executable code, graphics, and formulas in a single representation.

In [1]: print "Hello World!"
Hello World! 

In [2]: 3/2
Out[2]: 1

In [3]: 5.0/2
Out[3]: 2.5

In [4]: In
Out[4]: ['', u'print "Hello World!"', u'3/2', u'5.0/2', u'_i2', u'In']

In [5]: In[3]
Out[5]: u'5.0/2'

{2: 1,
 3: 2.5,
 4: ['',
  u'print "Hello World!"',
  u'3/2',
  u'5.0/2',
  u'_i2',
  u'In',
  u'In[3]',
  u'Out'],
 5: u'5.0/2'}

SciPy

SciPy (pronounced "Sigh Pie") is a set of open-source Python libraries specialized for scientific computing. Many of these libraries will be the protagonists of many chapters of the book, given that knowledge of them is critical to data analysis. Together they constitute a set of tools for calculating and displaying data that rivals other specialized environments for calculation and data analysis (such as R or MATLAB).

 NumPy is the foundation library for scientific computing.
 Pandas provides complex data structures and functions specifically designed to make working with them easy, fast, and effective.
 matplotlib is currently the most popular library for producing plots and other 2D data visualizations.
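
As a minimal illustration of how these three libraries fit together (just a sketch with made-up data, not an example from the book), NumPy supplies the raw arrays, pandas adds a labeled tabular structure on top of them, and matplotlib turns the result into a plot:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# NumPy: raw numerical data
values = np.random.random((10, 2))
# pandas: labeled, tabular structure on top of the array
df = pd.DataFrame(values, columns=['x', 'y'])
# matplotlib: turn the structured data into a 2D visualization
df.plot(kind='scatter', x='x', y='y')
plt.show()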

Saturday, February 13, 2016

Python Data Analysis 1 - An Introduction to Data Analysis in Python

The Data Analysis Process

Problem definition
The definition step and the corresponding documentation (deliverables) of the scientific or business problem are both very important in order to focus the entire analysis strictly on getting results. In fact, a comprehensive or exhaustive study of the system is sometimes complex and you do not always have enough information to start with. So the definition of the problem, and especially its planning, can uniquely determine the guidelines to follow for the whole project.

Data extraction
When you want to get the data, a good place to start is the Web. But much of the data on the Web can be difficult to capture; in fact, not all data are available as a file or in a database; some exist only, more or less implicitly, as content inside HTML pages in many different formats. To this end, a methodology called web scraping has been developed, which allows the collection of data through the recognition of specific occurrences of HTML tags within web pages. There is software specifically designed for this purpose: once an occurrence is found, it extracts the desired data. When the search is complete, you get a list of data ready to be subjected to data analysis.
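
As a minimal sketch of this idea (the URL below is hypothetical), pandas' read_html function, which also appears later in these notes, recognizes <table> tags inside a page and returns them as DataFrames ready for analysis:

import pandas as pd

# scan the page for <table> tags; one DataFrame is returned per table found
tables = pd.read_html('http://example.com/page_with_tables.html')
print(len(tables))        # number of tables recognized in the page
print(tables[0].head())   # first rows of the first extracted table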

Data preparation
The preparation of the data is concerned with obtaining, cleaning, normalizing, and transforming data into an optimized data set, that is, a prepared format, normally tabular, suitable for the methods of analysis scheduled during the design phase.
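
A minimal pandas sketch of this step (column names and values are invented purely for illustration): drop missing values and normalize a numeric column so the result is a clean, tabular data set.

import pandas as pd

# a small, invented raw data set
raw = pd.DataFrame({'city': ['Rome', 'Milan', None, 'Turin'],
                    'price': [1.2, None, 0.9, 1.7]})
# cleaning: drop the rows that contain missing values
clean = raw.dropna().copy()
# transforming: rescale the numeric column to the 0-1 range
clean['price_norm'] = ((clean['price'] - clean['price'].min()) /
                       (clean['price'].max() - clean['price'].min()))
print(clean)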

Data exploration
Exploring the data essentially means searching it in a graphical or statistical presentation in order to find patterns, connections, and relationships. Data visualization is the best tool to highlight possible patterns.

Generally, data analysis requires processes of summarization of the data to be studied. Summarization is a process by which data are reduced to an interpretable form without sacrificing important information.
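
For example (a sketch on an invented DataFrame), describe() and groupby() are typical summarization tools in pandas: they reduce the raw rows to a few interpretable numbers without losing the overall picture.

import pandas as pd

# invented data: measurements grouped by color
df = pd.DataFrame({'color': ['red', 'red', 'blue', 'blue', 'green'],
                   'value': [1.0, 1.5, 3.2, 2.8, 0.7]})
print(df.describe())                         # global summary statistics
print(df.groupby('color')['value'].mean())   # summarization per group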

Clustering is a method of data analysis that is used to find groups united by common attributes (grouping).

Another important step of the analysis focuses on the identification of relationships, trends, and anomalies in the data. In order to find this kind of information, you often have to resort to additional tools and perform another round of data analysis, this time on the data visualization itself.

Other methods of data mining, such as decision trees and association rules, automatically extract important facts or rules from data. These approaches can be used in parallel with the data visualization to find information about the relationships between the data.

Predictive modeling
Classification models: If the result obtained by the model type is categorical.
Regression models: If the result obtained by the model type is numeric.
Clustering models: If the result obtained by the model type is descriptive.

Model validation/test
Generally, you will refer to the data as the training set, when you are using them for building the model, and as the validation set, when you are using them for validating the model.
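
A minimal sketch of such a split with pandas (the 80/20 proportion and the random seed are just assumptions for illustration):

import numpy as np
import pandas as pd

# an invented data set with 100 observations
data = pd.DataFrame({'x': np.random.random(100),
                     'y': np.random.random(100)})
# training set: a random 80% sample used to build the model
train = data.sample(frac=0.8, random_state=42)
# validation set: the remaining 20% used to validate the model
validation = data.drop(train.index)
print(len(train), len(validation))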

Deployment of the solution
The deployment basically consists of putting into practice the results obtained from the data analysis.


Open Data

DataHub (http://datahub.io/dataset)
World Health Organization (http://www.who.int/research/en/)
Data.gov (http://data.gov)
European Union Open Data Portal (http://open-data.europa.eu/en/data/)
Amazon Web Service public datasets (http://aws.amazon.com/datasets)
Facebook Graph (http://developers.facebook.com/docs/graph-api)
Healthdata.gov (http://www.healthdata.gov)
Google Trends (http://www.google.com/trends/explore)
Google Finance (https://www.google.com/finance)
Google Books Ngrams (http://storage.googleapis.com/books/ngrams/books/datasetsv2.html)
Machine Learning Repository (http://archive.ics.uci.edu/ml/)


Friday, February 12, 2016

Master R 14 - Analyzing the R community

R Foundation members

library(XML)
page <- htmlParse('http://r-project.org/foundation/donors.html')
list <- unlist(xpathApply(page, "//h3[@id='supporting-members']/following-sibling::ul[1]/li", xmlValue))
str(list)

supporterlist <- sub(' \\([a-zA-Z ]*\\)$', '', list)
countrylist   <- substr(list, nchar(supporterlist) + 3, nchar(list) - 1)
tail(sort(prop.table(table(countrylist)) * 100), 5)

Visualizing supporting members around the world

countries <- as.data.frame(table(countrylist))
library(rworldmap)
joinCountryData2Map(countries, joinCode = 'NAME', nameJoinColumn = 'countrylist', verbose = TRUE)

library(ggmap)
for (fix in c('Brasil', 'CZ', 'Danmark', 'NL')) {
   countrylist[which(countrylist == fix)] <-
       geocode(fix, output = 'more')$country
}

countries <- as.data.frame(table(countrylist))
countries <- joinCountryData2Map(countries, joinCode = 'NAME', nameJoinColumn = 'countrylist')
mapCountryData(countries, 'Freq', catMethod = 'logFixedWidth', mapTitle = 'Number of R Foundation supporting members')


R package maintainers

packages <- readHTMLTable(paste0('http://cran.r-project.org', '/web/checks/check_summary.html'), which = 2)
maintainers <- sub('(.*) <(.*)>', '\\1', packages$' Maintainer')
maintainers <- gsub(' ', ' ', maintainers)
str(maintainers)
tail(sort(table(maintainers)), 8)


The number of packages per maintainer

N <- as.numeric(table(maintainers))
library(fitdistrplus)
plotdist(N)
descdist(N, boot = 1e3)
(gparams <- fitdist(N, 'gamma'))
gshape <- gparams$estimate[['shape']]
grate  <- gparams$estimate[['rate']]
sum(rgamma(1e5, shape = gshape, rate = grate))
hist(rgamma(1e5, shape = gshape, rate = grate))
pgamma(2, shape = gshape, rate = grate)
prop.table(table(N <= 2))
ploc <- min(N)
pshp <- length(N) / sum(log(N) - log(ploc))

library(actuar)
ppareto(2, pshp, ploc)
fg <- fitdist(N, 'gamma')
fw <- fitdist(N, 'weibull')
fl <- fitdist(N, 'lnorm')
fp <- fitdist(N, 'pareto', start = list(shape = 1, scale = 1))
par(mfrow = c(1, 2))
denscomp(list(fg, fw, fl, fp), addlegend = FALSE)
qqcomp(list(fg, fw, fl, fp),  legendtext = c('gamma', 'Weibull', 'Lognormal', 'Pareto')) 
length(unique(maintainers))


The R-help mailing list

library(RCurl)
url <- getURL('https://stat.ethz.ch/pipermail/r-help/')
R.help.toc <- htmlParse(url)
R.help.archives <- unlist(xpathApply(R.help.toc, "//table//td[3]/a", xmlAttrs), use.names = FALSE)
dir.create('help-r')
for (f in R.help.archives)
     download.file(url = paste0(url, f), file.path('help-r', f), method = 'curl')
lines <- system(paste0("zgrep -E '^From: .* at .*' ./help-r/*.txt.gz"), intern = TRUE)
length(lines)
length(unique(lines))
lines[26]
lines    <- sub('.*From: ', '', lines)
Rhelpers <- sub('.*\\((.*)\\)', '\\1', lines)
tail(sort(table(Rhelpers)), 6)
grep('Brian( D)? Ripley', names(table(Rhelpers)), value = TRUE)

Volume of the R-help mailing list

lines <- system(paste0(
"zgrep -E '^Date: [A-Za-z]{3}, [0-9]{1,2} [A-Za-z]{3} ",
"[0-9]{4} [0-9]{2}:[0-9]{2}:[0-9]{2} [-+]{1}[0-9]{4}' ",
"./help-r/*.txt.gz"), intern = TRUE)
length(lines)
head(sub('.*Date: ', '', lines[1]))
times <- strptime(sub('.*Date: ', '', lines), format = '%a, %d %b %Y %H:%M:%S %z')
plot(table(format(times, '%Y')), type = 'l')

library(data.table)
Rhelp <- data.table(time = times)
Rhelp[, H := hour(time)]
Rhelp[, D := wday(time)]
library(ggplot2)
ggplot(na.omit(Rhelp[, .N, by = .(H, D)]),
     aes(x = factor(H), y = factor(D), size = N)) + geom_point() +
     ylab('Day of the week') + xlab('Hour of the day') +
     ggtitle('Number of mails posted on [R-help]') +
     theme_bw() + theme('legend.position' = 'top')
tail(sort(table(sub('.*([+-][0-9]{4}).*', '\\1', lines))), 22)

Forecasting the e-mail volume in the future

Rhelp[, date := as.Date(time)]
Rdaily <- na.omit(Rhelp[, .N, by = date])
Rdaily <- zoo(Rdaily$N, Rdaily$date)
plot(Rdaily)

library(forecast)
fit <- ets(Rdaily)
predict(fit, 1)
plot(forecast(fit, 30), include = 365)


Analyzing overlaps between our lists of R users

lists <- rbindlist(list(
data.frame(name = unique(supporterlist), list = 'supporter'),
data.frame(name = unique(maintainers),   list = 'maintainer'),
data.frame(name = unique(Rhelpers),      list = 'R-help')))

t <- table(lists$name, lists$list)
table(rowSums(t))
library(Rcapture)
descriptive(t)
closedp(t)

Further ideas on extending the capture-recapture models


The number of R users in social media

library(fbRads)
fbad_init(FB account ID, FB API token)
fbad_get_search(q = 'rstats', type = 'adinterest')
fbad_get_search(fbacc = fbacc, q = 'SPSS', type = 'adinterest')
res <- fbad_get_search(fbacc = fbacc, q = 'programming language', type = 'adinterest')
res <- res[order(res$audience_size, decreasing = TRUE), ]
res[1:10, 1:3]


R-related posts in social media

library(twitteR)
setup_twitter_oauth(...)
str(searchTwitter("#rstats", n = 1, resultType = 'recent'))
tweets <- Rtweets(n = 500)
length(strip_retweets(tweets))
tweets <- twListToDF(tweets)

library(tm)
corpus <- Corpus(VectorSource(tweets$text))
corpus <- tm_map(corpus, removeWords, stopwords("english"))
corpus <- tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, stripWhitespace)
corpus <- tm_map(corpus, removeWords, 'rstats')


library(wordcloud)
wordcloud(corpus)

Wednesday, February 10, 2016

Master R 13 - Data Around Us

Geocoding

library(hflights)
library(data.table)
dt <- data.table(hflights)[, list(
    N         = .N,
    Cancelled = sum(Cancelled),
    Distance  = Distance[1],
    TimeVar   = sd(ActualElapsedTime, na.rm = TRUE),
    ArrDelay  = mean(ArrDelay, na.rm = TRUE)) , by = Dest]

str(dt)

library(ggmap)
(h <- geocode('Houston, TX'))
dt[, c('lon', 'lat') := geocode(Dest)]

str(setDF(dt))


Visualizing point data in space

library(maps)
map('state')
title('Flight destinations from Houston,TX')
points(h$lon, h$lat, col = 'blue', pch = 13)
points(dt$lon, dt$lat, col = 'red', pch = 19)
text(dt$lon, dt$lat + 1, labels = dt$Dest, cex = 0.7)

map('state')
title('Frequent flight destinations from Houston,TX')
points(h$lon, h$lat, col = 'blue', pch = 13)
points(dt$lon, dt$lat, pch = 19, col = rgb(1, 0, 0, dt$N / max(dt$N)))
legend('bottomright', legend = round(quantile(dt$N)), pch = 19, col = rgb(1, 0, 0, quantile(dt$N) / max(dt$N)), box.col = NA)


Finding polygon overlays of point data

str(map_data <- map('state', plot = FALSE, fill = TRUE))
grep('^washington', map_data$names, value = TRUE)
states <- sapply(strsplit(map_data$names, ':'), '[[', 1)
library(maptools)
us <- map2SpatialPolygons(map_data, IDs = states, proj4string = CRS("+proj=longlat +datum=WGS84"))
plot(us)

library(sp)
dtp <- SpatialPointsDataFrame(dt[, c('lon', 'lat')], dt, proj4string = CRS("+proj=longlat +datum=WGS84"))
str(sp::over(us, dtp))
str(cancels <- sapply(sp::over(us, dtp, returnList = TRUE), function(x) sum(x$Cancelled)))
val <- cancels$Cancelled[match(states, row.names(cancels))]
val[is.na(val)] <- 0


Plotting thematic maps

map("state", col = rgb(1, 0, 0, sqrt(val/max(val))), fill = TRUE)
title('Number of cancelled flights from Houston to US states')
points(h$lon, h$lat, col = 'blue', pch = 13)
legend('bottomright', legend = round(quantile(val)), fill = rgb(1, 0, 0, sqrt(quantile(val)/max(val))), box.col = NA)

Rendering polygons around points

Contour lines

library(fields)
out <- as.image(dt$ArrDelay, x = dt[, c('lon', 'lat')], nrow = 256, ncol = 256)
table(is.na(out$z))
image(out)
image(out, xlim = base::range(map_data$x, na.rm = TRUE), ylim = base::range(map_data$y, na.rm = TRUE))
look <- image.smooth(out, theta = .5)
table(is.na(look$z))
image(look)

usa_data <- map('usa', plot = FALSE, region = 'main')
p <- expand.grid(look$x, look$y)
library(sp)
n <- which(point.in.polygon(p$Var1, p$Var2, usa_data$x, usa_data$y) == 0)
look$z[n] <- NA

map("usa")
image(look, add = TRUE)
map("state", lwd = 3, add = TRUE)
title('Arrival delays of flights from Houston')
points(dt$lon, dt$lat, pch = 19, cex = .5)
points(h$lon, h$lat, pch = 13)

Voronoi diagrams

library(deldir)
map("usa")
plot(deldir(dt$lon, dt$lat), wlines = "tess", lwd = 2, pch = 19, col = c('red', 'darkgray'), add = TRUE)


Satellite maps

library(OpenStreetMap)
map <- openmap(c(max(map_data$y, na.rm = TRUE),
                  min(map_data$x, na.rm = TRUE)),
                c(min(map_data$y, na.rm = TRUE),
                  max(map_data$x, na.rm = TRUE)),
                type = 'stamen-terrain')
map <- openproj(map, projection = '+proj=longlat +ellps=WGS84 +datum=WGS84 +no_defs')
plot(map)
plot(deldir(dt$lon, dt$lat), wlines = "tess", lwd = 2, col = c('red', 'black'), pch = 19, cex = 0.5, add = TRUE)


Interactive maps

Querying Google Maps

cancels$state <- rownames(cancels)
cancels$Cancelled[is.na(cancels$Cancelled)] <- 0

library(googleVis)
plot(gvisGeoChart(cancels, 'state', 'Cancelled',
                   options = list(
                       region      = 'US',
                       displayMode = 'regions', 
                       resolution  = 'provinces')))
dt$LatLong <- paste(dt$lat, dt$lon, sep = ':')
dt$tip <- apply(dt, 1, function(x) paste(names(dt), x, collapse = '<br/ >'))
plot(gvisMap(dt, 'LatLong', tipvar = 'tip'))


JavaScript mapping libraries

devtools::install_github('ramnathv/rCharts')
library(rCharts)
map <- Leaflet$new()
map$setView(as.numeric(dt[which(dt$Dest == 'MCI'),
   c('lat', 'lon')]), zoom = 4)
for (i in 1:nrow(dt))
   map$marker(c(dt$lat[i], dt$lon[i]), bindPopup = dt$tip[i])
map$show()

library(leaflet)
leaflet(us) %>%
   addProviderTiles("Acetate.terrain") %>%
   addPolygons() %>%
   addMarkers(lng = dt$lon, lat = dt$lat, popup = dt$tip)


Alternative map designs

dt <- dt[point.in.polygon(dt$lon, dt$lat, usa_data$x, usa_data$y) == 1, ]

library(diagram)
library(scales)
map("usa")
title('Number of flights, cancellations and delays from Houston')
image(look, add = TRUE)
map("state", lwd = 3, add = TRUE)
for (i in 1:nrow(dt)) {
   curvedarrow(
     from       = rev(as.numeric(h)),
     to         = as.numeric(dt[i, c('lon', 'lat')]),
     arr.pos    = 1,
     arr.type   = 'circle',
     curve      = 0.1,
     arr.col    = alpha('black', dt$N[i] / max(dt$N)),
     arr.length = dt$N[i] / max(dt$N),
     lwd        = dt$Cancelled[i] / max(dt$Cancelled) * 25,
     lcol       = alpha('black',
                    dt$Cancelled[i] / max(dt$Cancelled)))
 }


Spatial statistics

dm <- dist(dt[, c('lon', 'lat')])
dm <- as.matrix(dm)
idm <- 1 / dm
diag(idm) <- 0
str(idm)
dt$TimeVar[is.na(dt$TimeVar)] <- 0
library(ape)
Moran.I(dt$TimeVar, idm)

library(spdep)
idml <- mat2listw(idm)
moran.test(dt$TimeVar, idml)
idml <- mat2listw(idm, style = "W")

moran.test(dt$TimeVar, idml)

Master R 12 - Analyzing Time-series

Creating time-series objects

library(hflights)
library(data.table)
dt <- data.table(hflights)
dt[, date := ISOdate(Year, Month, DayofMonth)]

daily <- dt[, list(
     N         = .N,
     Delays    = sum(ArrDelay, na.rm = TRUE),
     Cancelled = sum(Cancelled),
     Distance  = mean(Distance)
 ), by = date]
str(daily)


Visualizing time-series

plot(ts(daily))
setorder(daily, date)
plot(ts(daily))
plot(ts(daily$N, start = 2011, frequency = 365), main = 'Number of flights from Houston in 2011')


Seasonal decomposition

plot(decompose(ts(daily$N, frequency = 7)))
setNames(decompose(ts(daily$N, frequency = 7))$figure, weekdays(daily$date[1:7]))
decompose(ts(daily$N, frequency = 365))


Holt-Winters filtering

nts <- ts(daily$N, frequency = 7)
fit <- HoltWinters(nts, beta = FALSE, gamma = FALSE)
plot(fit)
fit <- HoltWinters(nts)
plot(fit)
library(forecast)
forecast(fit)
plot(forecast(HoltWinters(nts), 31))


Autoregressive Integrated Moving Average models

auto.arima(nts)
auto.arima(nts, approximation = FALSE)
plot(forecast(auto.arima(nts, D = 1, approximation = FALSE), 31))


Outlier detection

cts <- ts(daily$Cancelled)
fit <- auto.arima(cts)
auto.arima(cts)

library(tsoutliers)
outliers <- tso(cts, tsmethod = 'arima',
   args.tsmethod  = list(order = c(1, 1, 2)))
plot(outliers)
plot(tso(ts(daily$Cancelled)))
dfc <- as.data.frame(daily[, c('date', 'Cancelled'), with = FALSE])

library(AnomalyDetection)
AnomalyDetectionTs(dfc, plot = TRUE)$plot


More complex time-series objects

library(zoo)
zd <- zoo(daily[, -1, with = FALSE], daily[[1]])
plot(zd)

plot(cumsum(zd))

Tuesday, February 9, 2016

Master R 11 - Social Network analysis of the R Ecosystem

Loading network data

library(tools)
pkgs <- available.packages()
str(pkgs)

head(package.dependencies(pkgs), 2)
library(plyr)
edges <- ldply(
   c('Depends', 'Imports', 'Suggests'), function(depLevel) {
     deps <- package.dependencies(pkgs, depLevel = depLevel)
     ldply(names(deps), function(pkg)
         if (!identical(deps[[pkg]], NA))
             data.frame(
                 src   = pkg,
                 dep   = deps[[pkg]][, 1],
                 label = depLevel,
                 stringsAsFactors = FALSE))
})

str(edges)


Centrality measures of networks
 compute centrality and density metrics
 identify bridges and simulate changes in the network

nrow(edges) / (nrow(pkgs) * (nrow(pkgs) - 1))
head(sort(table(edges$dep), decreasing = TRUE))
edges <- edges[edges$dep != 'R', ]

library(igraph)
g <- graph.data.frame(edges)
summary(g)
graph.density(g)
head(sort(degree(g), decreasing = TRUE))
head(sort(betweenness(g), decreasing = TRUE))


Visualizing network data

plot(degree(g), betweenness(g), type = 'n', main = 'Centrality of R package dependencies')
text(degree(g), betweenness(g), labels = V(g)$name)

edges <- edges[edges$label != 'Suggests', ]
deptree <- edges$dep[edges$src == 'igraph']
while (!all(edges$dep[edges$src %in% deptree] %in% deptree))
   deptree <- union(deptree, edges$dep[edges$src %in% deptree])
deptree

g <- graph.data.frame(edges[edges$src %in% c('igraph', deptree), ])
plot(g)

V(g)$label.color <- 'orange'
V(g)$label.color[V(g)$name == 'igraph'] <- 'darkred'
V(g)$label.color[V(g)$name %in% edges$dep[edges$src == 'igraph']] <- 'orangered'
E(g)$color <- c('blue', 'green')[factor(E(g)$label)]
plot(g, vertex.shape = 'none', edge.label = NA)
tkplot(g, edge.label = NA)

library(visNetwork)
nodes <- get.data.frame(g, 'vertices')
names(nodes) <- c('id', 'color')
edges <- get.data.frame(g)
visNetwork(nodes, edges)

Custom plot layouts

g <- dominator.tree(g, root = "igraph")$domtree
plot(g, layout = layout.reingold.tilford(g, root = "igraph"), vertex.shape = 'none')

Analyzing R package dependencies with an R package

library(miniCRAN)
pkgs <- pkgAvail()
pkgDep('igraph', availPkgs = pkgs, suggests = FALSE, includeBasePkgs = TRUE)

plot(makeDepGraph('igraph', pkgs, suggests = FALSE, includeBasePkgs = TRUE))

Monday, February 8, 2016

Master R 10 - Classification and Clustering

Cluster analysis

# Hierarchical clustering

d <- dist(mtcars)
h <- hclust(d)
h
plot(h)
rect.hclust(h, k=3, border = "red")
cn <- cutree(h, k=3)
table(cn)
round(aggregate(mtcars, FUN = mean, by = list(cn)), 1)
round(aggregate(mtcars, FUN = sd, by = list(cn)), 1)
round(sapply(mtcars, sd), 1)
round(apply(aggregate(mtcars, FUN = mean, by = list(cn)),2, sd), 1)

# Determining the ideal number of clusters

install.packages('NbClust')
library(NbClust)
NbClust(mtcars, method = 'complete', index = 'dindex')
NbClust(mtcars, method = 'complete', index = 'hartigan')$Best.nc
NbClust(mtcars, method = 'complete', index = 'kl')$Best.nc
NbClust(iris[, -5], method = 'complete', index = 'all')$Best.nc[1,]

# K-means clustering

(k <- kmeans(mtcars, 3))
all(cn == k$cluster)

# Visualizing clusters
library(cluster) 
clusplot(mtcars, k$cluster, color = TRUE, shade = TRUE, labels = 2)


# Latent class models
factors <- c('cyl', 'vs', 'am', 'carb', 'gear')
mtcars[, factors] <- lapply(mtcars[, factors], factor)

# Latent Class Analysis
install.packages('poLCA')
library(poLCA)
p <- poLCA(cbind(cyl, vs, am, carb, gear) ~ 1,data = mtcars, graphs = TRUE, nclass = 3)
p$P

# Latent class regression
# Discriminant analysis
rm(mtcars)
mtcars$gear <- factor(mtcars$gear)
library(MASS)
d <- lda(gear ~ ., data = mtcars, CV =TRUE)
(tab <- table(mtcars$gear, d$class))
sum(diag(tab)) / sum(tab)
round(d$posterior, 4)
d <- lda(gear ~ ., data = mtcars)
plot(d)
plot(d, dimen = 1, type = "both" )


# Logistic regression
lr <- glm(am ~ hp + wt, data = mtcars, family = binomial)
summary(lr)
table(mtcars$am, round(predict(lr, type = 'response')))

install.packages('nnet')
library(nnet) 
(mlr <- multinom(factor(gear) ~ ., data = mtcars)) 
table(mtcars$gear, predict(mlr))
rm(mtcars)


# Machine learning algorithms
# The K-Nearest Neighbors algorithm

#K-Nearest Neighbors is a supervised classification algorithm, which is mostly used in pattern recognition and business analytics. A big advantage of k-NN is that it is not sensitive to outliers, and its usage is extremely straightforward.

set.seed(42)
n     <- nrow(mtcars)
train <- mtcars[sample(n, n/2), ]

library(dplyr)
train <- sample_n(mtcars, n / 2)
test <- mtcars[setdiff(row.names(mtcars), row.names(train)), ]
library(class)
(cm <- knn(
  train = subset(train, select = -gear),
  test  = subset(test, select = -gear),
  cl    = train$gear,
  k     = 5))
cor(test$gear, as.numeric(as.character(cm)))
table(train$gear)

# Classification trees
library(rpart)
ct <- rpart(factor(gear) ~ ., data = train, minsplit = 3)
summary(ct)
plot(ct)
text(ct)
table(test$gear, predict(ct, newdata = test, type = 'class'))

install.packages('party')
library(party)
ct <- ctree(factor(gear) ~ drat, data = train, controls = ctree_control(minsplit = 3)) 
plot(ct, main = "Conditional Inference Tree")
table(test$gear, predict(ct, newdata = test, type = 'node'))

install.packages('randomForest')
library(randomForest)
(rf <- randomForest(factor(gear) ~ ., data = train, ntree = 250))
table(test$gear, predict(rf, test))   
plot(rf)
legend('topright', legend = colnames(rf$err.rate), col    = 1:4, fill   = 1:4, bty    = 'n')

# Other algorithms
install.packages(c('caret','C50'))
library(caret)
library(C50)
C50 <- train(factor(gear) ~ ., data = train, method = 'C5.0')
summary(C50)
table(test$gear, predict(C50, test))

Saturday, February 6, 2016

Master R 9 - From Big to Small Data

Adequacy tests

Normality

library(hflights)
JFK <- hflights[which(hflights$Dest == 'JFK'), c('TaxiIn', 'TaxiOut')]
JFK <- subset(hflights, Dest == 'JFK', select = c(TaxiIn, TaxiOut))

par(mfrow = c(1, 2))
qqnorm(JFK$TaxiIn, ylab = 'TaxiIn')
qqline(JFK$TaxiIn)
qqnorm(JFK$TaxiOut, ylab = 'TaxiOut')
qqline(JFK$TaxiOut)

shapiro.test(JFK$TaxiIn)

Multivariate normality

JFK <- na.omit(JFK)
library(MVN)
mardiaTest(JFK)
hzTest(JFK)
mvt <- roystonTest(JFK)
par(mfrow = c(1, 2))
mvnPlot(mvt, type = "contour", default = TRUE)
mvnPlot(mvt, type = "persp", default = TRUE)

set.seed(42)
mvt <- roystonTest(MASS::mvrnorm(100, mu = c(0, 0), Sigma = matrix(c(10, 3, 3, 2), 2)))
mvnPlot(mvt, type = "contour", default = TRUE)
mvnPlot(mvt, type = "persp", default = TRUE)


Dependence of variables

hflights_numeric <- hflights[, which(sapply(hflights, is.numeric))]
cor(hflights_numeric, use = "pairwise.complete.obs")
str(cor(hflights_numeric, use = "pairwise.complete.obs"))
hflights_numeric <- hflights[,which(sapply(hflights, function(x) is.numeric(x) && var(x, na.rm = TRUE) != 0))]
table(is.na(cor(hflights_numeric, use = "pairwise.complete.obs")))

library(ellipse)
plotcorr(cor(hflights_numeric, use = "pairwise.complete.obs"))
plotcorr(cor(data.frame(
1:10,
1:10 + runif(10),
1:10 + runif(10) * 5,
runif(10),
10:1,
check.names = FALSE)))
cor(hflights$FlightNum, hflights$Month)

KMO and Barlett's test

library(psych)
KMO(cor(hflights_numeric, use = "pairwise.complete.obs"))
cor(hflights_numeric[, c('Cancelled', 'AirTime')])
cancelled <- which(hflights_numeric$Cancelled == 1)
table(hflights_numeric$AirTime[cancelled], exclude = NULL)
table(hflights_numeric$Cancelled)
hflights_numeric <- subset(hflights_numeric, select = -Cancelled)
which(is.na(cor(hflights_numeric, use = "pairwise.complete.obs")), arr.ind = TRUE)
hflights_numeric <- subset(hflights_numeric, select = -Diverted)
KMO(cor(hflights_numeric[, -c(14)], use = "pairwise.complete.obs"))
KMO(mtcars)
cortest.bartlett(cor(mtcars))


Principal Component Analysis

PCA algorithms
prcomp(mtcars, scale = TRUE)
summary(prcomp(mtcars, scale = TRUE))
sum(prcomp(scale(mtcars))$sdev^2)

Determining the number of components

prcomp(scale(mtcars))$sdev^2
VSS.scree(cor(mtcars))
scree(cor(mtcars))
fa.parallel(mtcars)

Interpreting components

pc <- prcomp(mtcars, scale = TRUE)
head(pc$x[, 1:2])
head(scale(mtcars) %*% pc$rotation[, 1:2])
summary(pc$x[, 1:2])
apply(pc$x[, 1:2], 2, sd)
round(cor(pc$x))
pc$rotation[, 1:2]

biplot(pc, cex = c(0.8, 1.2))
abline(h = 0, v = 0, lty = 'dashed')
cor(mtcars, pc$x[, 1:2])

Rotation methods

varimax(pc$rotation[, 1:2])
pcv <- varimax(pc$rotation[, 1:2])$loadings
plot(scale(mtcars) %*% pcv, type = 'n', xlab = 'Transmission', ylab = 'Power')
text(scale(mtcars) %*% pcv, labels = rownames(mtcars))


library(GPArotation)
promax(pc$rotation[, 1:2])

Outlier-detection with PCA

library(jpeg)
t <- tempfile()
download.file('http://bit.ly/nasa-img', t)
img <- readJPEG(t)
str(img)
h <- dim(img)[1]
w <- dim(img)[2]
m <- matrix(img, h*w)
str(m)
pca <- prcomp(m)
summary(pca)
pca$rotation
extractColors <- function(x) rgb(x[1], x[2], x[3])
(colors <- apply(abs(pca$rotation), 2, extractColors))
pie(pca$sdev, col = colors, labels = colors)

par(mfrow = c(1, 2), mar = rep(0, 4))
image(matrix(pca$x[, 1], h), col = gray.colors(100))
image(matrix(pca$x[, 2], h), col = gray.colors(100), yaxt = 'n')


Factor analysis

m <- subset(mtcars, select = c(mpg, cyl, hp, carb))
(f <- fa(m))
fa.diagram(f)
cor(f$scores, mtcars$disp)


Principal Component Analysis versus Factor Analysis

PCA is used to reduce the number of variables by creating principal components that can then be used in further projects instead of the original variables. This means that we try to extract the essence of the dataset by means of artificially created variables that best describe the variance of the data.

FA is the other way around, as it tries to identify unknown, latent variables to explain the original data. In plain English, we use the manifest variables from our empirical dataset to guess the internal structure of the data.


Multidimensional Scaling

as.matrix(eurodist)[1:5, 1:5]
(mds <- cmdscale(eurodist))
plot(mds)
plot(mds, type = 'n')
text(mds[, 1], mds[, 2], labels(eurodist))
mds <- cmdscale(dist(mtcars))
plot(mds, type = 'n')

text(mds[, 1], mds[, 2], rownames(mds))

Thursday, February 4, 2016

Master R 8 - Polishing data

The types and origins of missing data

Missing Completely at Random (MCAR) means that every value in the dataset has the same probability of being missed, 
so no systematic error or distortion is to be expected due to missing data, nor can we explain the pattern of missing values.

Missing at Random (MAR) means that the reason for the missingness is known, or at least can be identified, although it has nothing to do with the actual missing values.

Missing Not at Random (MNAR) means that data is missing for a specific reason that is highly related to the actual question, which classifies the missing values as nonignorable non-response.


Identifying missing data

library(hflights)
table(complete.cases(hflights))
prop.table(table(complete.cases(hflights))) * 100
sort(sapply(hflights, function(x) sum(is.na(x))))

By-passing missing values

mean(cor(apply(hflights, 2, function(x) as.numeric(is.na(x)))), na.rm = TRUE)
Funs <- Filter(is.function, sapply(ls(baseenv()), get, baseenv()))
names(Filter(function(x) any(names(formals(args(x))) %in% 'na.rm'), Funs))
names(Filter(function(x) any(names(formals(args(x))) %in% 'na.rm'), Filter(is.function,sapply(ls('package:stats'), get, 'package:stats'))))

Overriding the default arguments of a function

myMean <- function(...) mean(..., na.rm = TRUE)
mean(c(1:5, NA))
myMean(c(1:5, NA))

library(rapportools)
mean(c(1:5, NA))
detach('package:rapportools')
mean(c(1:5, NA))

library(Defaults)
setDefaults(mean.default, na.rm = TRUE)
mean(c(1:5, NA))
setDefaults(mean, na.rm = TRUE)
mean

Getting rid of missing data

na.omit(c(1:5, NA))
na.exclude(c(1:5, NA))

x <- rnorm(10); y <- rnorm(10)
x[1] <- NA; y[2] <- NA
exclude <- lm(y ~ x, na.action = "na.exclude")
omit <- lm(y ~ x, na.action = "na.omit")
residuals(exclude)

m <- matrix(1:9, 3)
m[which(m %% 4 == 0, arr.ind = TRUE)] <- NA
m
na.omit(m)

Filtering missing data before or during the actual analysis

mean(hflights$ActualElapsedTime)
mean(hflights$ActualElapsedTime, na.rm = TRUE)
mean(na.omit(hflights$ActualElapsedTime))

library(microbenchmark)
NA.RM   <- function() mean(hflights$ActualElapsedTime, na.rm = TRUE)
NA.OMIT <- function() mean(na.omit(hflights$ActualElapsedTime))
microbenchmark(NA.RM(), NA.OMIT())

Data imputation

m[which(is.na(m), arr.ind = TRUE)] <- 0
m

ActualElapsedTime <- hflights$ActualElapsedTime
mean(ActualElapsedTime, na.rm = TRUE)
ActualElapsedTime[which(is.na(ActualElapsedTime))] <- mean(ActualElapsedTime, na.rm = TRUE)
mean(ActualElapsedTime)


library(Hmisc)
mean(impute(hflights$ActualElapsedTime, mean))
sd(hflights$ActualElapsedTime, na.rm = TRUE)
sd(ActualElapsedTime)
summary(iris)

library(missForest)
set.seed(81)
miris <- prodNA(iris, noNA = 0.2)
summary(miris)
iiris <- missForest(miris, xtrue = iris, verbose = TRUE)
str(iiris)

Comparing different imputation methods

miris <- miris[, 1:4]
iris_mean <- impute(miris, fun = mean)
iris_forest <- missForest(miris)
diag(cor(iris[, -5], iris_mean))
diag(cor(iris[, -5], iris_forest$ximp))

Not imputing missing values

Multiple imputation

Extreme values and outliers

detach('package:missForest')
detach('package:randomForest')

library(outliers)
outlier(hflights$DepDelay)
summary(hflights$DepDelay)

library(lattice)
bwplot(hflights$DepDelay)
IQR(hflights$DepDelay, na.rm = TRUE)

Testing extreme values

set.seed(83)
dixon.test(c(runif(10), pi))
model <- lm(hflights$DepDelay ~ 1)
model$coefficients
mean(hflights$DepDelay, na.rm = TRUE)
a <- 0.1
(n <- length(hflights$DepDelay))
(F <- qf(1 - (a/n), 1, n-2, lower.tail = TRUE))
(L <- ((n - 1) * F / (n - 2 + F))^0.5)
sum(abs(rstandard(model)) > L)

Using robust method

summary(lm(Sepal.Length ~ Petal.Length, data = miris))
lm(Sepal.Length ~ Petal.Length, data = iris)$coefficients

library(MASS)
summary(rlm(Sepal.Length ~ Petal.Length, data = miris))
f <- formula(Sepal.Length ~ Petal.Length)
cbind(orig =  lm(f, data = iris)$coefficients, lm   =  lm(f, data = miris)$coefficients, rlm  = rlm(f, data = miris)$coefficients)

miris$Sepal.Length[1] <- 14

cbind(orig = lm(f, data = iris)$coefficients, lm   = lm(f, data = miris)$coefficients, rlm  = rlm(f, data = miris)$coefficients)

Wednesday, February 3, 2016

Master R 7 - Unstructured Data

library(tm)
getSources()
getReaders()

# Importing the corpus
res <- XML::readHTMLTable(paste0('http://cran.r-project.org/', 'web/packages/available_packages_by_name.html'), which = 1)
head(res)
v <- Corpus(VectorSource(res$V2)); v
inspect(head(v, 3))
meta(v[[1]])
writeLines(as.character(v[[1]]))
lapply(v[1:5], as.character)

# Cleaning the corpus
getTransformations()
stopwords("english")
removeWords('to be or not to be', stopwords("english"))
v <- tm_map(v, removeWords, stopwords("english"))
inspect(head(v, 3))

v <- tm_map(v, content_transformer(tolower))
v <- tm_map(v, removePunctuation)
v <- tm_map(v, stripWhitespace)
inspect(head(v, 3))

# Visualizing the most frequent words in the corpus
wordcloud::wordcloud(v)

# Further cleanup
v <- tm_map(v, removeNumbers)
tdm <- TermDocumentMatrix(v)
inspect(tdm[1:5, 1:20])
findFreqTerms(tdm, lowfreq = 100)

myStopwords <- c('package', 'based', 'using')
v <- tm_map(v, removeWords, myStopwords)

# Stemming words
library(SnowballC)
wordStem(c('cats', 'mastering', 'modelling', 'models', 'model'))
wordStem(c('are', 'analyst', 'analyze', 'analysis'))
d <- v
v <- tm_map(v, stemDocument, language = "english")
v <- tm_map(v, content_transformer(function(x, d) {
paste(stemCompletion(strsplit(stemDocument(x), ' ')[[1]],d),collapse = ' ')
}), d)

tdm <- TermDocumentMatrix(v)
findFreqTerms(tdm, lowfreq = 100)


# Lemmatisation
Stemming removes characters from the end of words in the hope of finding the stem, which is a heuristic process often resulting in non-existent words. We tried to overcome this issue by completing the stems to the shortest meaningful words using a dictionary, which might result in a deviation in the meaning of the term.
Another way to reduce the number of inflectional forms of different terms, instead of deconstructing and then trying to rebuild the words, is morphological analysis with the help of a dictionary. This process is called lemmatisation, and it looks for the lemma (the canonical form of a word) instead of stems.

# Analyzing the associations among terms
findAssocs(tdm, 'data', 0.1)

# Some other metrics
vnchar <- sapply(v, function(x) nchar(x$content))
summary(vnchar)
(vm <- which.min(vnchar))
v[[vm]]
hist(vnchar, main = 'Length of R package descriptions', xlab = 'Number of characters')

# The segmentation of documents
hadleyverse <- c('ggplot2', 'dplyr', 'reshape2', 'lubridate','stringr', 'devtools', 'roxygen2', 'tidyr')
(w <- which(res$V1 %in% hadleyverse))
plot(hclust(dist(DocumentTermMatrix(v[w]))), xlab = 'Hadleyverse packages')
sapply(v[w], function(x) structure(content(x), .Names = meta(x, 'id')))

Tuesday, February 2, 2016

Master R 6 - Beyond the linear trend line

The modeling workflow
1 First, fit the model with the main predictors and all the relevant confounders, and then reduce the number of confounders by dropping out the non-significant ones. 
2 Decide whether to use the continuous variables in their original or categorized form.
3 Try to achieve a better fit by testing for non-linear relationships, if they are pragmatically relevant.
4 Finally, check the model assumptions.

Logistic regression
library(catdata)
data(deathpenalty)
library(vcdExtra)
deathpenalty.expand <- expand.dft(deathpenalty)
binom.model.0 <- glm(DeathPenalty ~ DefendantRace, data = deathpenalty.expand, family = binomial)
summary(binom.model.0)
exp(cbind(OR = coef(binom.model.0), confint(binom.model.0)))

binom.model.1 <- update(binom.model.0, . ~ . + VictimRace)
summary(binom.model.1)
exp(cbind(OR = coef(binom.model.1), confint(binom.model.1)))

prop.table(table(factor(deathpenalty.expand$VictimRace,labels = c("VictimRace=0", "VictimRace=1")),factor(deathpenalty.expand$DefendantRace, labels = c("DefendantRace=0", "DefendantRace=1"))), 1)

Data consideration
As a general rule of thumb, logistic regression models require at least 10 events per predictor.

Goodness of model fit
library(lmtest)
lrtest(binom.model.1)
library(BaylorEdPsych)
PseudoR2(binom.model.1)

Model comparison
lrtest(binom.model.0, binom.model.1)

Models for count data
Poisson regression
dfa <- readRDS('SMART_2013.RData')
(ct <- xtabs(~model+failure, data=dfa))
dfa <- dfa[dfa$model %in% names(which(rowSums(ct) - ct[, 1] > 0)),]

library(ggplot2)
ggplot(rbind(dfa, data.frame(model='All', dfa[, -1] )), aes(failure)) + ylab("log(count)") + 
geom_histogram(binwidth = 1, drop=TRUE, origin = -0.5)  + 
scale_y_log10() + scale_x_continuous(breaks=c(0:10)) + 
facet_wrap( ~ model, ncol = 3) + ggtitle("Histograms by manufacturer") + theme_bw()

poiss.base <- glm(failure ~ model, offset(log(freq)),  family = 'poisson', data = dfa)
summary(poiss.base)
contrasts(dfa$model, sparse = TRUE)
lrtest(poiss.base)

Negative binomial regression
library(MASS)
model.negbin.0 <- glm.nb(failure ~ model, offset(log(freq)), data = dfa)
lrtest(poiss.base,model.negbin.0)

Multivariate non-linear models
model.negbin.1 <- update(model.negbin.0, . ~ . + capacity_bytes + age_month + temperature)
model.negbin.2 <- update(model.negbin.1, . ~ . + PendingSector)
lrtest(model.negbin.0, model.negbin.1, model.negbin.2)
summary(model.negbin.2)
exp(data.frame(exp_coef = coef(model.negbin.2)))
dfa$model <- relevel(dfa$model, 'WDC')


model.negbin.3 <- update(model.negbin.2, data = dfa)
library(broom)
format(tidy(model.negbin.3), digits = 4)

library(data.table)
dfa <- data.table(dfa)
dfa[, temp6 := cut2(temperature, g = 6)]
temperature.weighted.mean <- dfa[, .(wfailure = weighted.mean(failure, freq)), by = temp6] 
ggplot(temperature.weighted.mean, aes(x = temp6, y = wfailure)) +  
geom_bar(stat = 'identity') + xlab('Categorized temperature') + ylab('Weighted mean of disk faults') + theme_bw()

model.negbin.4 <- update(model.negbin.0, .~. + capacity_bytes + age_month + temp6 + PendingSector, data = dfa)
AIC(model.negbin.3,model.negbin.4)

weighted.means <- rbind(dfa[, .(l = 'capacity', f = weighted.mean(failure, freq)), by = .(v = capacity_bytes)], dfa[, .(l = 'age', f = weighted.mean(failure, freq)), by = .(v = age_month)])
ggplot(weighted.means, aes(x = l, y = f)) + geom_step() +
facet_grid(. ~ v, scales = 'free_x') + theme_bw() +
ylab('Weighted mean of disk faults') + xlab('')

dfa[, capacity_bytes := as.factor(capacity_bytes)]
dfa[, age8 := cut2(age_month, g = 8)]
model.negbin.5 <- update(model.negbin.0, .~. + capacity_bytes + age8 + temp6 + PendingSector, data = dfa)
AIC(model.negbin.5, model.negbin.4)
format(tidy(model.negbin.5), digits = 3)

tmnb5 <- tidy(model.negbin.5)
str(terms <- tmnb5$term[tmnb5$p.value < 0.05][-1])

library(plyr)
ci <- ldply(terms, function(t) confint(model.negbin.5, t))
names(ci) <- c('min', 'max')
ci$term <- terms
ci$variable <- sub('[A-Z0-9\\]\\[,() ]*$', '', terms, perl = TRUE)

ggplot(ci, aes(x = factor(term), color = variable)) + 
geom_errorbar(ymin = min, ymax = max) + xlab('') +
ylab('Coefficients (95% conf.int)') + theme_bw() + 
theme(axis.text.x = element_text(angle = 90, hjust = 1), legend.position = 'top')
PseudoR2(model.negbin.6 )
