Monday, February 29, 2016

Python Data Analysis 5 - pandas: Reading and Writing Data

I/O API Tools

-- CSV and textual Files

read_csv
read_table
to_csv

-- Reading Data in CSV or Text Files
csvframe = read_csv('myCSV_01.csv')
csvframe
read_table('ch05_01.csv',sep=',')
read_csv('ch05_02.csv')
read_csv('ch05_02.csv', header=None)
read_csv('ch05_02.csv', names=['white','red','blue','green','animal'])
read_csv('ch05_03.csv', index_col=['color','status'])

-- Using RegExp for Parsing TXT Files
read_table('ch05_04.txt',sep='\s*')
read_table('ch05_05.txt',sep='\D*',header=None)
read_table('ch05_06.txt',sep=',',skiprows=[0,1,3,6])

-- Reading TXT Files into Parts or Partially
read_csv('ch05_02.csv',skiprows=[2],nrows=3,header=None)
out = Series()
i=0
pieces = read_csv('ch05_01.csv',chunksize=3)
for piece in pieces:
    out.set_value(i,piece['white'].sum())
    i = i + 1

-- Writing Data in CSV
frame2
frame2.to_csv('ch05_07.csv')
frame2.to_csv('ch05_07b.csv', index=False, header=False)

frame3
frame3.to_csv('ch05_09.csv', na_rep='NaN')

-- Reading and Writing HTML Files
read_html
to_html

-- Writing Data in HTML
frame = pd.DataFrame(np.arange(4).reshape(2,2))
print(frame.to_html())
frame = pd.DataFrame( np.random.random((4,4)),
                    index = ['white','black','red','blue'],
                    columns = ['up','down','right','left'])
s = ['<HTML>']
s.append('<HEAD><TITLE>My DataFrame</TITLE></HEAD>')
s.append('<BODY>')
s.append(frame.to_html())
s.append('</BODY></HTML>')
html = ''.join(s)
html_file = open('myFrame.html','w')
html_file.write(html)
html_file.close()

-- Reading Data from an HTML File
web_frames = pd.read_html('myFrame.html')
web_frames[0]
ranking = pd.read_html('http://www.meccanismocomplesso.org/en/meccanismo-complesso-sito-2/classifica-punteggio/')
ranking[0]

-- Reading Data from XML
from lxml import objectify
xml = objectify.parse('books.xml')
xml
root = xml.getroot()
root.Book.Author
root.Book.PublishDate
root.getchildren()
[child.tag for child in root.Book.getchildren()]
[child.text for child in root.Book.getchildren()]

def etree2df(root):
    column_names = []
    for i in range(0, len(root.getchildren()[0].getchildren())):
       column_names.append(root.getchildren()[0].getchildren()[i].tag)
    xmlframe = pd.DataFrame(columns=column_names)
    for j in range(0, len(root.getchildren())):
       obj = root.getchildren()[j].getchildren()
       texts = []
       for k in range(0, len(column_names)):
          texts.append(obj[k].text)
       row = dict(zip(column_names, texts))
       row_s = pd.Series(row)
       row_s.name = j
       xmlframe = xmlframe.append(row_s)
    return xmlframe

etree2df(root)

-- Reading and Writing Data on Microsoft Excel Files
to_excel()
read_excel()
pd.read_excel('data.xls')
pd.read_excel('data.xls','Sheet2')
pd.read_excel('data.xls',1)
frame = pd.DataFrame(np.random.random((4,4)),
                      index = ['exp1','exp2','exp3','exp4'],
                      columns = ['Jan2015','Feb2015','Mar2015','Apr2015'])
frame
frame.to_excel('data2.xlsx')

-- JSON Data

read_json
to_json
frame = pd.DataFrame(np.arange(16).reshape(4,4),
                      index=['white','black','red','blue'],
                      columns=['up','down','right','left'])
frame.to_json('frame.json')
pd.read_json('frame.json')

from pandas.io.json import json_normalize
import json
file = open('books.json','r')
text = file.read()
text = json.loads(text)
json_normalize(text,'books')
json_normalize(text2,'books',['writer','nationality'])

-- The Format HDF5
from pandas.io.pytables import HDFStore
frame = pd.DataFrame(np.arange(16).reshape(4,4),
                      index=['white','black','red','blue'],
                      columns=['up','down','right','left'])
store = HDFStore('mydata.h5')
store['obj1'] = frame
frame2
store['obj2'] = frame2

store
store['obj2']

-- Pickle: Python Object Serialization

-- Serialize a Python Object with cPickle

import cPickle as pickle
data = { 'color': ['white','red'], 'value': [5, 7]}
pickled_data = pickle.dumps(data)
print pickled_data
nframe = pickle.loads(pickled_data)
nframe

-- Pickling with pandas

frame = pd.DataFrame(np.arange(16).reshape(4,4), index = ['up','down','left','right'])
frame.to_pickle('frame.pkl')

pd.read_pickle('frame.pkl')

-- Interacting with Databases
from sqlalchemy import create_engine

engine = create_engine('postgresql://scott:tiger@localhost:5432/mydatabase')
engine = create_engine('mysql+mysqldb://scott:tiger@localhost/foo')
engine = create_engine('oracle://scott:tiger@127.0.0.1:1521/sidname')
engine = create_engine('mssql+pyodbc://mydsn')
engine = create_engine('sqlite:///foo.db')

-- Loading and Writing Data with SQLite3
frame = pd.DataFrame( np.arange(20).reshape(4,5), columns=['white','red','blue','black','green'])
frame
engine = create_engine('sqlite:///foo.db')
frame.to_sql('colors',engine)
pd.read_sql('colors',engine)
import sqlite3
query = """
CREATE TABLE test
(a VARCHAR(20), b VARCHAR(20),
c REAL,        d INTEGER
);"""
con = sqlite3.connect(':memory:')
con.execute(query)
con.commit()

data = [('white','up',1,3),
         ('black','down',2,8),
         ('green','up',4,4),
         ('red','down',5,5)]
stmt = "INSERT INTO test VALUES(?,?,?,?)"
con.executemany(stmt, data)
con.commit()

cursor = con.execute('select * from test')
cursor
rows = cursor.fetchall()
rows

cursor.description
pd.DataFrame(rows, columns=zip(*cursor.description)[0])

-- Loading and Writing Data with PostgreSQL
pd.__version__
engine = create_engine('postgresql://postgres:password@localhost:5432/postgres')
frame = pd.DataFrame(np.random.random((4,4)),
              index=['exp1','exp2','exp3','exp4'],
              columns=['feb','mar','apr','may'])
frame.to_sql('dataframe',engine)
psql -U postgres
pd.read_sql_table('dataframe',engine)
pd.read_sql_query('SELECT index,apr,may FROM DATAFRAME WHERE apr > 0.5',engine)

-- Reading and Writing Data with a NoSQL Database: MongoDB
mongod --dbpath C:\MongoDB_data
import pymongo
from pymongo import MongoClient
client = MongoClient('localhost',27017)
db = client.mydatabase
db
collection = db.mycollection
db['mycollection']
collection
frame = pd.DataFrame( np.arange(20).reshape(4,5),
                       columns=['white','red','blue','black','green'])
frame

import json
record = json.loads(frame.T.to_json()).values()
record
collection.mydocument.insert(record)

cursor = collection['mydocument'].find()
dataframe = (pd.DataFrame(list(cursor)))
del dataframe['_id']
dataframe


Conclusions

Thursday, February 25, 2016

Python Data Analysis 4 - The pandas Library - An Introduction

Pandas: The Python Data Analytics Library

Installation

Installation from Anaconda

Installation on Linux

-- Test Your pandas Installation

nosetests pandas

-- Getting Started with pandas

import pandas as pd
import numpy as np
from pandas import *

-- Introduction to pandas Data Structures

-- The Series

 Declaring a Series

s = pd.Series([12,-4,7,9])
s

s = pd.Series([12,-4,7,9], index=['a','b','c','d'])
s
s.values
s.index

 Selecting the Internal Elements

s[2]
s['b']
s[['b', 'c']]

 Assigning Values to the Elements

s[1] = 0
s
s['b'] = 1
s

 Defining Series from NumPy Arrays and Other Series

arr = np.array([1,2,3,4])
s3 = pd.Series(arr)
s3
s4 = pd.Series(s)
s4
s3
arr[2] = -2
s3

 Filtering Values

s[s > 8]

 Operations and Mathematical Functions

s/2

np.log(s)


 Evaluating Values

serd = pd.Series([1,0,2,1,2,3], index=['white','white','blue','green','green','yellow'])
serd
serd.unique()
serd.value_counts()
serd.isin([0,3])

 NaN Values

s2 = pd.Series([5,-3,np.NaN,14])
s2
s2.isnull()
s2[s2.notnull()]

 Series as Dictionaries

mydict = {'red': 2000, 'blue': 1000, 'yellow': 500, 'orange': 1000}
myseries = pd.Series(mydict)
colors = ['red','yellow','orange','blue','green']
myseries = pd.Series(mydict, index=colors)

 Operations between Series

mydict2 = {'red':400,'yellow':1000,'black':700}
myseries2 = pd.Series(mydict2)
myseries + myseries2

-- The DataFrame

 Defining a DataFrame

data = {'color' : ['blue','green','yellow','red','white'],
                     'object' : ['ball','pen','pencil','paper','mug'],
                     'price' : [1.2,1.0,0.6,0.9,1.7]}
frame = pd.DataFrame(data)
frame
frame2 = pd.DataFrame(data, columns=['object','price'])
frame2
frame2 = pd.DataFrame(data, index=['one','two','three','four','five'])
frame2
frame3 = pd.DataFrame(np.arange(16).reshape((4,4)),
                   index=['red','blue','yellow','white'],
                   columns=['ball','pen','pencil','paper'])
frame3

 Selecting Elements

frame.columns
frame.index
frame.values
frame['price']
frame.ix[2]
frame.ix[[2,4]]
frame[0:1]
frame[1:3]
frame['object'][3]

 Assigning Values

frame.index.name = 'id'; frame.columns.name = 'item'
frame
frame['new'] = 12
frame
frame['new'] = [3.0,1.3,2.2,0.8,1.1]
frame
ser = pd.Series(np.arange(5))
ser
frame['price'][2] = 3.3

 Membership of a Value

frame.isin([1.0,'pen'])
frame[frame.isin([1.0,'pen'])]

 Deleting a Column

del frame['new']
frame

 Filtering

frame[frame < 12]

 DataFrame from Nested dict

nestdict = { 'red': { 2012: 22, 2013: 33 },
                     'white': { 2011: 13, 2012: 22, 2013: 16},
                     'blue': {2011: 17, 2012: 27, 2013: 18}}
frame2 = pd.DataFrame(nestdict)
frame2

 Transposition of a DataFrame

frame2.T

-- The Index Objects

ser = pd.Series([5,0,3,8,4], index=['red','blue','yellow','white','green'])
ser.index

 Methods on Index

ser.idxmin()
ser.idxmax()

 Index with Duplicate Labels

serd = pd.Series(range(6), index=['white','white','blue','green','green','yellow'])
serd

serd['white']
serd.index.is_unique
frame.index.is_unique

-- Other Functionalities on Indexes

ser = pd.Series([2,5,7,4], index=['one','two','three','four'])
ser
ser.reindex(['three','four','five','one'])
ser3 = pd.Series([1,5,6,3],index=[0,3,5,6])
ser3
ser3.reindex(range(6),method='ffill')
ser3.reindex(range(6),method='bfill')
frame.reindex(range(5), method='ffill',columns=['colors','price','new','object'])

-- Dropping

drop()
ser = Series(np.arange(4.), index=['red','blue','yellow','white'])
ser
ser.drop('yellow')
ser.drop(['blue','white'])

frame = pd.DataFrame(np.arange(16).reshape((4,4)),
                   index=['red','blue','yellow','white'],
                   columns=['ball','pen','pencil','paper'])
frame
frame.drop(['blue','yellow'])
frame.drop(['pen','pencil'],axis=1)

-- Arithmetic and Data Alignment

s1 = pd.Series([3,2,5,1],['white','yellow','green','blue'])
s2 = pd.Series([1,4,7,2,1],['white','yellow','black','blue','brown'])

s1+s2

frame1 = pd.DataFrame(np.arange(16).reshape((4,4)),
                   index=['red','blue','yellow','white'],
                   columns=['ball','pen','pencil','paper'])
frame2 = pd.DataFrame(np.arange(12).reshape((4,3)),
                   index=['blue','green','white','yellow'],
                   columns=['mug','pen','ball'])
frame1
frame2
frame1+frame2

-- Operations between Data Structures

Flexible Arithmetic Methods

frame1.add(frame2)

-- Operations between DataFrame and Series

frame = pd.DataFrame(np.arange(16).reshape((4,4)),
                   index=['red','blue','yellow','white'],
                   columns=['ball','pen','pencil','paper'])
frame
ser = pd.Series(np.arange(4), index=['ball','pen','pencil','paper'])
ser
frame - ser
ser['mug'] = 9
ser
frame - ser

-- Function Application and Mapping

Functions by Element
frame = pd.DataFrame(np.arange(16).reshape((4,4)),
                   index=['red','blue','yellow','white'],
                   columns=['ball','pen','pencil','paper'])
frame

np.sqrt(frame)

-- Functions by Row or Column

f = lambda x: x.max() - x.min()
def f(x):
    return x.max() - x.min()
frame.apply(f)
frame.apply(f, axis=1)

def f(x):
     return pd.Series([x.min(), x.max()], index=['min','max'])
frame.apply(f)

-- Statistics Functions

frame.sum()
frame.mean()
frame.describe()

-- Sorting and Ranking
ser = pd.Series([5,0,3,8,4], index=['red','blue','yellow','white','green'])
ser
ser.sort_index(ascending=False)
frame = pd.DataFrame(np.arange(16).reshape((4,4)),
                   index=['red','blue','yellow','white'],
                   columns=['ball','pen','pencil','paper'])
frame
ser.order()
frame.sort_index(by='pen')
frame.sort_index(by=['pen','pencil'])
ser.rank()
ser.rank(method='first')
ser.rank(ascending=False)

-- Correlation and Covariance
seq2 = pd.Series([3,4,3,4,5,4,3,2],['2006','2007','2008','2009','2010','2011','2012','2013'])
seq = pd.Series([1,2,3,4,4,3,2,1],['2006','2007','2008','2009','2010','2011','2012','2013'])
seq.corr(seq2)

frame2 = DataFrame([[1,4,3,6],[4,5,6,1],[3,3,1,5],[4,1,6,4]],
                     index=['red','blue','yellow','white'],
                     columns=['ball','pen','pencil','paper'])
frame2
frame2.corr()
frame2.cov()
frame2.corrwith(ser)
frame2.corrwith(frame)

-- "Not a Number" Data

Assigning a NaN Value

ser = pd.Series([0,1,2,np.NaN,9], index=['red','blue','yellow','white','green'])
ser
ser['white'] = None
ser

Filtering Out NaN Values

ser.dropna()
ser[ser.notnull()]
frame3 = pd.DataFrame([[6,np.nan,6],[np.nan,np.nan,np.nan],[2,np.nan,5]],
                        index = ['blue','green','red'],
                        columns = ['ball','mug','pen'])
frame3
frame3.dropna(how='all')

-- Filling in NaN Occurrences

frame3.fillna(0)

frame3.fillna({'ball':1,'mug':0,'pen':99})

-- Hierarchical Indexing and Leveling
mser = pd.Series(np.random.rand(8),
        index=[['white','white','white','blue','blue','red','red','red'],
        ['up','down','right','up','down','up','down','left']])
mser
mser.index
mser['white']
mser[:,'up']
mser['white','up']
mser.unstack()

frame
frame.stack()

mframe = pd.DataFrame(np.random.randn(16).reshape(4,4),
      index=[['white','white','red','red'], ['up','down','up','down']],
      columns=[['pen','pen','paper','paper'],[1,2,1,2]])
mframe

-- Reordering and Sorting Levels

mframe.columns.names = ['objects','id']
mframe.index.names = ['colors','status']
mframe
mframe.swaplevel('colors', 'status')
mframe.sortlevel('colors')

-- Summary Statistic by Level

mframe.sum(level='colors')
mframe.sum(level='id', axis=1)

Tuesday, February 23, 2016

Python Data Analysis 3 - The NumPy Library

-- The NumPy Installation

sudo apt-get install python-numpy

import numpy as np

-- Ndarray: The Heart of the Library

a = np.array([1, 2, 3])
a
type(a)
a.dtype
a.ndim
a.size
a.shape

b = np.array([[1.3, 2.4],[0.3, 4.1]])
b.dtype
b.ndim
b.size
b.shape
b.itemsize
b.data

-- Create an Array
c = np.array([[1, 2, 3],[4, 5, 6]])
c
d = np.array(((1, 2, 3),(4, 5, 6)))
d
e = np.array([(1, 2, 3), [4, 5, 6], (7, 8, 9)])
e

-- Types of Data
g = np.array([['a', 'b'],['c', 'd']])
g
g.dtype
g.dtype.name


-- The dtype Option
f = np.array([[1, 2, 3],[4, 5, 6]], dtype=complex)
f

-- Intrinsic Creation of an Array
np.zeros((3, 3))
np.ones((3, 3))
np.arange(0, 10)
np.arange(4, 10)
np.arange(0, 12, 3)
np.arange(0, 6, 0.6)
np.arange(0, 12).reshape(3, 4)
np.linspace(0,10,5)
np.random.random(3)
np.random.random((3,3))

-- Basic Operations
-- Arithmetic Operators
a = np.arange(4)
a
a+4
a*2
b = np.arange(4,8)
a+b
a-b
a*b
a * np.sin(b)
a * np.sqrt(b)

A = np.arange(0, 9).reshape(3, 3)
A
B = np.ones((3, 3))
B
A*B

-- The Matrix Product
np.dot(A,B)
A.dot(B)
np.dot(B,A)

-- Increment and Decrement Operators
a = np.arange(4)
a
a += 1
a
a -= 1
a
a += 4
a
a *= 2
a

-- Universal Functions (ufunc)
a = np.arange(1, 5)
a
np.sqrt(a)
np.log(a)
np.sin(a)

-- Aggregate Functions
a = np.array([3.3, 4.5, 1.2, 5.7, 0.3])
a.sum()
a.min()
a.max()
a.mean()
a.std()

-- Indexing, Slicing and Iterating
a = np.arange(10, 16)
a
a[4]
a[-1]
a[-6]
a[[1,3,4]]

A = np.arange(10, 19).reshape((3, 3))
A
A[1,2]

a = np.arange(10, 16)
a
a[1:5]
a[1:5:2]
a[::2]
a[:5:2]
a[:5:]

A = np.arange(10, 19).reshape((3, 3))
A
A[0,:]
A[0:2, 0:2]
A[[0,2], 0:2]

Iterating an Array
for i in a:
    print(i)
for row in A:
    print(row)
for item in A.flat:
    print(item)
np.apply_along_axis(np.mean, axis=0, arr=A)
np.apply_along_axis(np.mean, axis=1, arr=A)

def foo(x):
    return x/2

np.apply_along_axis(foo, axis=1, arr=A)
np.apply_along_axis(foo, axis=0, arr=A)

-- Conditions and Boolean Arrays
A = np.random.random((4, 4))
A
A<0.5
A[A<0.5]

-- Shape Manipulation
a = np.random.random(12)
a
A = a.reshape(3, 4)
A

a.shape = (3, 4)
a
a = a.ravel()
a.shape = (12)
a
A.transpose()

-- Array Manipulation

-- Joining Arrays

A = np.ones((3, 3))
B = np.zeros((3, 3))
np.vstack((A, B))
np.hstack((A, B))
a = np.array([0, 1, 2])
b = np.array([3, 4, 5])
c = np.array([6, 7, 8])
np.column_stack((a, b, c))
np.row_stack((a, b, c))

-- Splitting Arrays
A = np.arange(16).reshape((4, 4))
A
[B,C] = np.hsplit(A, 2)
B
C
[B,C] = np.vsplit(A, 2)
B
C
[A1,A2,A3] = np.split(A,[1,3],axis=1)
A1
A2
A3
[A1,A2,A3] = np.split(A,[1,3],axis=0)
A1
A2
A3

-- General Concepts

Copies or Views of Objects

a = np.array([1, 2, 3, 4])
b = a
b
a[2] = 0
b
c = a[0:2]
c
a[0] = 0
c
a = np.array([1, 2, 3, 4])
c = a.copy()
c
a[0]=0
c

-- Vectorization

a * b
A * B
for (i = 0; i < rows; i++){
  c[i] = a[i]*b[i];
}
for( i=0; i < rows; i++){
   for(j=0; j < columns; j++){
      c[i][j] = a[i][j]*b[i][j];
   }
}

-- Broadcasting
A = np.arange(16).reshape(4, 4)
b = np.arange(4)
A
b
A+b
m = np.arange(6).reshape(3, 1, 2)
n = np.arange(6).reshape(3, 2, 1)
m
n
m+n

-- Structured Arrays

structured = np.array([(1, 'First', 0.5, 1+2j),(2, 'Second', 1.3, 2-2j), (3, 'Third', 0.8, 1+3j)],dtype=('i2, a6, f4, c8'))
structured
structured = np.array([(1, 'First', 0.5, 1+2j),(2, 'Second', 1.3,2-2j), (3, 'Third', 0.8, 1+3j)],dtype=('int16, a6, float32, complex64'))
structured
structured[1]
structured['f1']
structured = np.array([(1,'First',0.5,1+2j),(2,'Second',1.3,2-2j),(3,'Third',0.8,1+3j)],dtype=[('id','i2'),('position','a6'),('value','f4'),('complex','c8')])
structured
structured.dtype.names = ('id','order','value','complex')
structured['order']

Reading and Writing Array Data on Files

Loading and Saving Data in Binary Files

data
np.save('saved_data',data)
loaded_data = np.load('saved_data.npy')
loaded_data

-- Reading Files with Tabular Data
data = np.genfromtxt('data.csv', delimiter=',', names=True)
data

data2 = np.genfromtxt('data2.csv', delimiter=',', names=True)
data2
data2['id']

data2[0]

Monday, February 22, 2016

Python Data Analysis 2 - Introduction to the Python's World

Chap 2 Introduction to the Python's World


The Python interpreter is simply a program that reads and interprets the commands passed to the prompt. You have seen that the interpreter can accept either a single command at a time or entire files of Python code. However, the approach by which it performs this is always the same.

Each time you press the Enter key, the interpreter begins to scan the written code (either a single line or a full file of code) token by token (tokenization). These tokens are fragments of text that the interpreter arranges in a tree structure. The resulting tree is the logical structure of the program, which is then converted to bytecode (.pyc or .pyo). The process chain ends with the bytecode, which is executed by a Python virtual machine (PVM).
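
To make this pipeline concrete, here is a minimal sketch (not from the book) that uses the standard dis module to inspect the bytecode the interpreter generates for a small, made-up function:

import dis

def add(a, b):
    # the interpreter tokenizes this source, builds the parse tree,
    # and compiles it to bytecode before the PVM executes it
    return a + b

# disassemble the bytecode instructions produced for add()
dis.dis(add)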

Using Python

myname = raw_input("What is your name? ")
print "Hi " + myname + ", Iím glad to say: Hello world!"

Writing Python Code

1 + 2
(1.045 * 3)/4
4 ** 2
((4 + 5j) * (2 + 3j))
4 < (2*3)
a = 12 * 3.4
a

import math
math.sin(a)
from math import *
sin(a)
from math import sin

Data Structure

list, set, strings, tuples, dictionary, deque, heap

# dictionary
dict = {'name':'William', 'age':25, 'city':'London'}
dict["name"]

for key, value in dict.items():
 print(key,value)

list = [1,2,3,4]
list
list[2]
list[1:3]
list[-1]
items = [1,2,3,4,5]
for item in items:
 item + 1

# Functional Programming (Only for Python 3.4)

map(function, list)
filter(function, list)
reduce(function, list)
lambda
list comprehension

items = [1,2,3,4,5]
def inc(x): return x+1
list(map(inc,items))
list(map((lambda x: x+1),items))
list(filter((lambda x: x < 4), items))
from functools import reduce
reduce((lambda x,y: x/y), items)

S = [x**2 for x in range(5)] 
S

# Indentation

a = 4
if a > 3:
    if a < 5:
        print("I'm four")
    else:
        print("I'm a little number")

if a > 3:
    if a < 5:
        print("I'm four")
    else:
        print("I'm a big number")

IPython

IPython is a further development of Python that includes a number of tools: the IPython shell, a powerful interactive shell resulting in a greatly enhanced Python terminal; the QtConsole, a hybrid between a shell and a GUI that allows graphics to be displayed inside the console instead of in separate windows; and finally the IPython Notebook, a web interface that allows you to mix text, executable code, graphics, and formulas in a single representation.

In [1]: print "Hello World!"
Hello World! 

In [2]: 3/2
Out[2]: 1

In [3]: 5.0/2
Out[3]: 2.5

In [4]: In
Out[4]: ['', u'print "Hello World!"', u'3/2', u'5.0/2', u'_i2', u'In']

In [5]: In[3]
Out[5]: u'5.0/2'

{2: 1,
 3: 2.5,
 4: ['',
  u'print "Hello World!"',
  u'3/2',
  u'5.0/2',
  u'_i2',
  u'In',
  u'In[3]',
  u'Out'],
 5: u'5.0/2'}

SciPy

SciPy (pronounced "Sigh Pie") is a set of open-source Python libraries specialized for scientific computing. Many of these libraries will be the protagonists of many chapters of the book, given that knowledge of them is critical to data analysis. Together they constitute a set of tools for calculating and displaying data that rivals other specialized environments for calculation and data analysis (such as R or MATLAB).

 NumPy is the foundation library for scientific computing.
 Pandas provides complex data structures and functions specifically designed to make working with them easy, fast, and effective.
 matplotlib is currently the most popular library for producing plots and other 2D data visualizations.
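
As a minimal illustration of how these three libraries fit together (just a sketch with made-up data, not an example from the book), NumPy supplies the raw arrays, pandas adds a labeled tabular structure on top of them, and matplotlib turns the result into a plot:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# NumPy: raw numerical data
values = np.random.random((10, 2))
# pandas: labeled, tabular structure on top of the array
df = pd.DataFrame(values, columns=['x', 'y'])
# matplotlib: turn the structured data into a 2D visualization
df.plot(kind='scatter', x='x', y='y')
plt.show()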

Saturday, February 13, 2016

Python Data Analysis 1 - An Introduction to Data Analysis in Python

The Data Analysis Process

Problem definition
The definition step and the corresponding documentation (deliverables) of the scientific or business problem are both very important in order to focus the entire analysis strictly on getting results. In fact, a comprehensive or exhaustive study of the system is sometimes complex and you do not always have enough information to start with. So the definition of the problem, and especially its planning, can uniquely determine the guidelines to follow for the whole project.

Data extraction
When you want to get the data, a good place to start is the Web. But much of the data on the Web can be difficult to capture; in fact, not all data are available as a file or in a database; some exist only, more or less implicitly, as content inside HTML pages in many different formats. To this end, a methodology called web scraping has been developed, which allows the collection of data through the recognition of specific occurrences of HTML tags within web pages. There is software specifically designed for this purpose: once an occurrence is found, it extracts the desired data. When the search is complete, you get a list of data ready to be subjected to data analysis.
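
As a minimal sketch of this idea (the URL below is hypothetical), pandas' read_html function, which also appears later in these notes, recognizes <table> tags inside a page and returns them as DataFrames ready for analysis:

import pandas as pd

# scan the page for <table> tags; one DataFrame is returned per table found
tables = pd.read_html('http://example.com/page_with_tables.html')
print(len(tables))        # number of tables recognized in the page
print(tables[0].head())   # first rows of the first extracted table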

Data preparation
The preparation of the data is concerned with obtaining, cleaning, normalizing, and transforming data into an optimized data set, that is, a prepared format, normally tabular, suitable for the methods of analysis scheduled during the design phase.
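
A minimal pandas sketch of this step (column names and values are invented purely for illustration): drop missing values and normalize a numeric column so the result is a clean, tabular data set.

import pandas as pd

# a small, invented raw data set
raw = pd.DataFrame({'city': ['Rome', 'Milan', None, 'Turin'],
                    'price': [1.2, None, 0.9, 1.7]})
# cleaning: drop the rows that contain missing values
clean = raw.dropna().copy()
# transforming: rescale the numeric column to the 0-1 range
clean['price_norm'] = ((clean['price'] - clean['price'].min()) /
                       (clean['price'].max() - clean['price'].min()))
print(clean)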

Data exploration
Exploring the data essentially means searching it in a graphical or statistical presentation in order to find patterns, connections, and relationships. Data visualization is the best tool to highlight possible patterns.

Generally, data analysis requires processes of summarization of the data to be studied. Summarization is a process by which data are reduced to an interpretable form without sacrificing important information.
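
For example (a sketch on an invented DataFrame), describe() and groupby() are typical summarization tools in pandas: they reduce the raw rows to a few interpretable numbers without losing the overall picture.

import pandas as pd

# invented data: measurements grouped by color
df = pd.DataFrame({'color': ['red', 'red', 'blue', 'blue', 'green'],
                   'value': [1.0, 1.5, 3.2, 2.8, 0.7]})
print(df.describe())                         # global summary statistics
print(df.groupby('color')['value'].mean())   # summarization per group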

Clustering is a method of data analysis that is used to find groups united by common attributes (grouping).

Another important step of the analysis focuses on the identification of relationships, trends, and anomalies in the data. In order to find this kind of information, you often have to resort to additional tools and perform another round of data analysis, this time on the data visualization itself.

Other methods of data mining, such as decision trees and association rules, automatically extract important facts or rules from data. These approaches can be used in parallel with the data visualization to find information about the relationships between the data.

Predictive modeling
Classification models: If the result obtained by the model type is categorical.
Regression models: If the result obtained by the model type is numeric.
Clustering models: If the result obtained by the model type is descriptive.

Model validation/test
Generally, you will refer to the data as the training set, when you are using them for building the model, and as the validation set, when you are using them for validating the model.
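
A minimal sketch of such a split with pandas (the 80/20 proportion and the random seed are just assumptions for illustration):

import numpy as np
import pandas as pd

# an invented data set with 100 observations
data = pd.DataFrame({'x': np.random.random(100),
                     'y': np.random.random(100)})
# training set: a random 80% sample used to build the model
train = data.sample(frac=0.8, random_state=42)
# validation set: the remaining 20% used to validate the model
validation = data.drop(train.index)
print(len(train), len(validation))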

Deployment of the solution
The deployment basically consists of putting into practice the results obtained from the data analysis.


Open Data

DataHub (http://datahub.io/dataset)
World Health Organization (http://www.who.int/research/en/)
Data.gov (http://data.gov)
European Union Open Data Portal (http://open-data.europa.eu/en/data/)
Amazon Web Service public datasets (http://aws.amazon.com/datasets)
Facebook Graph (http://developers.facebook.com/docs/graph-api)
Healthdata.gov (http://www.healthdata.gov)
Google Trends (http://www.google.com/trends/explore)
Google Finance (https://www.google.com/finance)
Google Books Ngrams (http://storage.googleapis.com/books/ngrams/books/datasetsv2.html)
Machine Learning Repository (http://archive.ics.uci.edu/ml/)


Friday, February 12, 2016

Master R 14 - Analyzing the R community

R Foundation members

library(XML)
page <- htmlParse('http://r-project.org/foundation/donors.html')
list <- unlist(xpathApply(page, "//h3[@id='supporting-members']/following-sibling::ul[1]/li", xmlValue))
str(list)

supporterlist <- sub(' \\([a-zA-Z ]*\\)$', '', list)
countrylist   <- substr(list, nchar(supporterlist) + 3, nchar(list) - 1)
tail(sort(prop.table(table(countrylist)) * 100), 5)

Visualizing supporting members around the world

countries <- as.data.frame(table(countrylist))
library(rworldmap)
joinCountryData2Map(countries, joinCode = 'NAME', nameJoinColumn = 'countrylist', verbose = TRUE)

library(ggmap)
for (fix in c('Brasil', 'CZ', 'Danmark', 'NL')) {
   countrylist[which(countrylist == fix)] <-
       geocode(fix, output = 'more')$country
}

countries <- as.data.frame(table(countrylist))
countries <- joinCountryData2Map(countries, joinCode = 'NAME', nameJoinColumn = 'countrylist')
mapCountryData(countries, 'Freq', catMethod = 'logFixedWidth', mapTitle = 'Number of R Foundation supporting members')


R package maintainers

packages <- readHTMLTable(paste0('http://cran.r-project.org', '/web/checks/check_summary.html'), which = 2)
maintainers <- sub('(.*) <(.*)>', '\\1', packages$' Maintainer')
maintainers <- gsub(' ', ' ', maintainers)
str(maintainers)
tail(sort(table(maintainers)), 8)


The number of packages per maintainer

N <- as.numeric(table(maintainers))
library(fitdistrplus)
plotdist(N)
descdist(N, boot = 1e3)
(gparams <- fitdist(N, 'gamma'))
gshape <- gparams$estimate[['shape']]
grate  <- gparams$estimate[['rate']]
sum(rgamma(1e5, shape = gshape, rate = grate))
hist(rgamma(1e5, shape = gshape, rate = grate))
pgamma(2, shape = gshape, rate = grate)
prop.table(table(N <= 2))
ploc <- min(N)
pshp <- length(N) / sum(log(N) - log(ploc))

library(actuar)
ppareto(2, pshp, ploc)
fg <- fitdist(N, 'gamma')
fw <- fitdist(N, 'weibull')
fl <- fitdist(N, 'lnorm')
fp <- fitdist(N, 'pareto', start = list(shape = 1, scale = 1))
par(mfrow = c(1, 2))
denscomp(list(fg, fw, fl, fp), addlegend = FALSE)
qqcomp(list(fg, fw, fl, fp),  legendtext = c('gamma', 'Weibull', 'Lognormal', 'Pareto')) 
length(unique(maintainers))


The R-help mailing list

library(RCurl)
url <- getURL('https://stat.ethz.ch/pipermail/r-help/')
R.help.toc <- htmlParse(url)
R.help.archives <- unlist(xpathApply(R.help.toc, "//table//td[3]/a", xmlAttrs), use.names = FALSE)
dir.create('help-r')
for (f in R.help.archives)
     download.file(url = paste0(url, f), file.path('help-r', f), method = 'curl')
lines <- system(paste0("zgrep -E '^From: .* at .*' ./help-r/*.txt.gz"), intern = TRUE)
length(lines)
length(unique(lines))
lines[26]
lines    <- sub('.*From: ', '', lines)
Rhelpers <- sub('.*\\((.*)\\)', '\\1', lines)
tail(sort(table(Rhelpers)), 6)
grep('Brian( D)? Ripley', names(table(Rhelpers)), value = TRUE)

Volume of the R-help mailing list

lines <- system(paste0(
"zgrep -E '^Date: [A-Za-z]{3}, [0-9]{1,2} [A-Za-z]{3} ",
"[0-9]{4} [0-9]{2}:[0-9]{2}:[0-9]{2} [-+]{1}[0-9]{4}' ",
"./help-r/*.txt.gz"), intern = TRUE)
length(lines)
head(sub('.*Date: ', '', lines[1]))
times <- strptime(sub('.*Date: ', '', lines), format = '%a, %d %b %Y %H:%M:%S %z')
plot(table(format(times, '%Y')), type = 'l')

library(data.table)
Rhelp <- data.table(time = times)
Rhelp[, H := hour(time)]
Rhelp[, D := wday(time)]
library(ggplot2)
ggplot(na.omit(Rhelp[, .N, by = .(H, D)]),
     aes(x = factor(H), y = factor(D), size = N)) + geom_point() +
     ylab('Day of the week') + xlab('Hour of the day') +
     ggtitle('Number of mails posted on [R-help]') +
     theme_bw() + theme('legend.position' = 'top')
tail(sort(table(sub('.*([+-][0-9]{4}).*', '\\1', lines))), 22)

Forecasting the e-mail volume in the future

Rhelp[, date := as.Date(time)]
Rdaily <- na.omit(Rhelp[, .N, by = date])
Rdaily <- zoo(Rdaily$N, Rdaily$date)
plot(Rdaily)

library(forecast)
fit <- ets(Rdaily)
predict(fit, 1)
plot(forecast(fit, 30), include = 365)


Analyzing overlaps between our lists of R users

lists <- rbindlist(list(
data.frame(name = unique(supporterlist), list = 'supporter'),
data.frame(name = unique(maintainers),   list = 'maintainer'),
data.frame(name = unique(Rhelpers),      list = 'R-help')))

t <- table(lists$name, lists$list)
table(rowSums(t))
library(Rcapture)
descriptive(t)
closedp(t)

Further ideas on extending the capture-recapture models


The number of R users in social media

library(fbRads)
fbad_init(FB account ID, FB API token)
fbad_get_search(q = 'rstats', type = 'adinterest')
fbad_get_search(fbacc = fbacc, q = 'SPSS', type = 'adinterest')
res <- fbad_get_search(fbacc = fbacc, q = 'programming language', type = 'adinterest')
res <- res[order(res$audience_size, decreasing = TRUE), ]
res[1:10, 1:3]


R-related posts in social media

library(twitteR)
setup_twitter_oauth(...)
str(searchTwitter("#rstats", n = 1, resultType = 'recent'))
tweets <- Rtweets(n = 500)
length(strip_retweets(tweets))
tweets <- twListToDF(tweets)

library(tm)
corpus <- Corpus(VectorSource(tweets$text))
corpus <- tm_map(corpus, removeWords, stopwords("english"))
corpus <- tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, stripWhitespace)
corpus <- tm_map(corpus, removeWords, 'rstats')


library(wordcloud)
wordcloud(corpus)

Wednesday, February 10, 2016

Master R 13 - Data Around Us

Geocoding

library(hflights)
library(data.table)
dt <- data.table(hflights)[, list(
    N         = .N,
    Cancelled = sum(Cancelled),
    Distance  = Distance[1],
    TimeVar   = sd(ActualElapsedTime, na.rm = TRUE),
    ArrDelay  = mean(ArrDelay, na.rm = TRUE)) , by = Dest]

str(dt)

library(ggmap)
(h <- geocode('Houston, TX'))
dt[, c('lon', 'lat') := geocode(Dest)]

str(setDF(dt))


Visualizing point data in space

library(maps)
map('state')
title('Flight destinations from Houston,TX')
points(h$lon, h$lat, col = 'blue', pch = 13)
points(dt$lon, dt$lat, col = 'red', pch = 19)
text(dt$lon, dt$lat + 1, labels = dt$Dest, cex = 0.7)

map('state')
title('Frequent flight destinations from Houston,TX')
points(h$lon, h$lat, col = 'blue', pch = 13)
points(dt$lon, dt$lat, pch = 19, col = rgb(1, 0, 0, dt$N / max(dt$N)))
legend('bottomright', legend = round(quantile(dt$N)), pch = 19, col = rgb(1, 0, 0, quantile(dt$N) / max(dt$N)), box.col = NA)


Finding polygon overlays of point data

str(map_data <- map('state', plot = FALSE, fill = TRUE))
grep('^washington', map_data$names, value = TRUE)
states <- sapply(strsplit(map_data$names, ':'), '[[', 1)
library(maptools)
us <- map2SpatialPolygons(map_data, IDs = states, proj4string = CRS("+proj=longlat +datum=WGS84"))
plot(us)

library(sp)
dtp <- SpatialPointsDataFrame(dt[, c('lon', 'lat')], dt, proj4string = CRS("+proj=longlat +datum=WGS84"))
str(sp::over(us, dtp))
str(cancels <- sapply(sp::over(us, dtp, returnList = TRUE), function(x) sum(x$Cancelled)))
val <- cancels$Cancelled[match(states, row.names(cancels))]
val[is.na(val)] <- 0


Plotting thematic maps

map("state", col = rgb(1, 0, 0, sqrt(val/max(val))), fill = TRUE)
title('Number of cancelled flights from Houston to US states')
points(h$lon, h$lat, col = 'blue', pch = 13)
legend('bottomright', legend = round(quantile(val)), fill = rgb(1, 0, 0, sqrt(quantile(val)/max(val))), box.col = NA)

Rendering polygons around points

Contour lines

library(fields)
out <- as.image(dt$ArrDelay, x = dt[, c('lon', 'lat')], nrow = 256, ncol = 256)
table(is.na(out$z))
image(out)
image(out, xlim = base::range(map_data$x, na.rm = TRUE), ylim = base::range(map_data$y, na.rm = TRUE))
look <- image.smooth(out, theta = .5)
table(is.na(look$z))
image(look)

usa_data <- map('usa', plot = FALSE, region = 'main')
p <- expand.grid(look$x, look$y)
library(sp)
n <- which(point.in.polygon(p$Var1, p$Var2, usa_data$x, usa_data$y) == 0)
look$z[n] <- NA

map("usa")
image(look, add = TRUE)
map("state", lwd = 3, add = TRUE)
title('Arrival delays of flights from Houston')
points(dt$lon, dt$lat, pch = 19, cex = .5)
points(h$lon, h$lat, pch = 13)

Voronoi diagrams

library(deldir)
map("usa")
plot(deldir(dt$lon, dt$lat), wlines = "tess", lwd = 2, pch = 19, col = c('red', 'darkgray'), add = TRUE)


Satellite maps

library(OpenStreetMap)
map <- openmap(c(max(map_data$y, na.rm = TRUE),
                  min(map_data$x, na.rm = TRUE)),
                c(min(map_data$y, na.rm = TRUE),
                  max(map_data$x, na.rm = TRUE)),
                type = 'stamen-terrain')
map <- openproj(map, projection = '+proj=longlat +ellps=WGS84 +datum=WGS84 +no_defs')
plot(map)
plot(deldir(dt$lon, dt$lat), wlines = "tess", lwd = 2, col = c('red', 'black'), pch = 19, cex = 0.5, add = TRUE)


Interactive maps

Querying Google Maps

cancels$state <- rownames(cancels)
cancels$Cancelled[is.na(cancels$Cancelled)] <- 0

library(googleVis)
plot(gvisGeoChart(cancels, 'state', 'Cancelled',
                   options = list(
                       region      = 'US',
                       displayMode = 'regions', 
                       resolution  = 'provinces')))
dt$LatLong <- paste(dt$lat, dt$lon, sep = ':')
dt$tip <- apply(dt, 1, function(x) paste(names(dt), x, collapse = '<br/ >'))
plot(gvisMap(dt, 'LatLong', tipvar = 'tip'))


JavaScript mapping libraries

devtools::install_github('ramnathv/rCharts')
library(rCharts)
map <- Leaflet$new()
map$setView(as.numeric(dt[which(dt$Dest == 'MCI'),
   c('lat', 'lon')]), zoom = 4)
for (i in 1:nrow(dt))
   map$marker(c(dt$lat[i], dt$lon[i]), bindPopup = dt$tip[i])
map$show()

library(leaflet)
leaflet(us) %>%
   addProviderTiles("Acetate.terrain") %>%
   addPolygons() %>%
   addMarkers(lng = dt$lon, lat = dt$lat, popup = dt$tip)


Alternative map designs

dt <- dt[point.in.polygon(dt$lon, dt$lat, usa_data$x, usa_data$y) == 1, ]

library(diagram)
library(scales)
map("usa")
title('Number of flights, cancellations and delays from Houston')
image(look, add = TRUE)
map("state", lwd = 3, add = TRUE)
for (i in 1:nrow(dt)) {
   curvedarrow(
     from       = rev(as.numeric(h)),
     to         = as.numeric(dt[i, c('lon', 'lat')]),
     arr.pos    = 1,
     arr.type   = 'circle',
     curve      = 0.1,
     arr.col    = alpha('black', dt$N[i] / max(dt$N)),
     arr.length = dt$N[i] / max(dt$N),
     lwd        = dt$Cancelled[i] / max(dt$Cancelled) * 25,
     lcol       = alpha('black',
                    dt$Cancelled[i] / max(dt$Cancelled)))
 }


Spatial statistics

dm <- dist(dt[, c('lon', 'lat')])
dm <- as.matrix(dm)
idm <- 1 / dm
diag(idm) <- 0
str(idm)
dt$TimeVar[is.na(dt$TimeVar)] <- 0
library(ape)
Moran.I(dt$TimeVar, idm)

library(spdep)
idml <- mat2listw(idm)
moran.test(dt$TimeVar, idml)
idml <- mat2listw(idm, style = "W")

moran.test(dt$TimeVar, idml)

Master R 12 - Analyzing Time-series

Creating time-series objects

library(hflights)
library(data.table)
dt <- data.table(hflights)
dt[, date := ISOdate(Year, Month, DayofMonth)]

daily <- dt[, list(
     N         = .N,
     Delays    = sum(ArrDelay, na.rm = TRUE),
     Cancelled = sum(Cancelled),
     Distance  = mean(Distance)
 ), by = date]
str(daily)


Visualizing time-series

plot(ts(daily))
setorder(daily, date)
plot(ts(daily))
plot(ts(daily$N, start = 2011, frequency = 365), main = 'Number of flights from Houston in 2011')


Seasonal decomposition

plot(decompose(ts(daily$N, frequency = 7)))
setNames(decompose(ts(daily$N, frequency = 7))$figure, weekdays(daily$date[1:7]))
decompose(ts(daily$N, frequency = 365))


Holt-Winters filtering

nts <- ts(daily$N, frequency = 7)
fit <- HoltWinters(nts, beta = FALSE, gamma = FALSE)
plot(fit)
fit <- HoltWinters(nts)
plot(fit)
library(forecast)
forecast(fit)
plot(forecast(HoltWinters(nts), 31))


Autoregressive Integrated Moving Average models

auto.arima(nts)
auto.arima(nts, approximation = FALSE)
plot(forecast(auto.arima(nts, D = 1, approximation = FALSE), 31))


Outlier detection

cts <- ts(daily$Cancelled)
fit <- auto.arima(cts)
auto.arima(cts)

library(tsoutliers)
outliers <- tso(cts, tsmethod = 'arima',
   args.tsmethod  = list(order = c(1, 1, 2)))
plot(outliers)
plot(tso(ts(daily$Cancelled)))
dfc <- as.data.frame(daily[, c('date', 'Cancelled'), with = FALSE])

library(AnomalyDetection)
AnomalyDetectionTs(dfc, plot = TRUE)$plot


More complex time-series objects

library(zoo)
zd <- zoo(daily[, -1, with = FALSE], daily[[1]])
plot(zd)

plot(cumsum(zd))

Tuesday, February 9, 2016

Master R 11 - Social Network analysis of the R Ecosystem

Loading network data

library(tools)
pkgs <- available.packages()
str(pkgs)

head(package.dependencies(pkgs), 2)
library(plyr)
edges <- ldply(
   c('Depends', 'Imports', 'Suggests'), function(depLevel) {
     deps <- package.dependencies(pkgs, depLevel = depLevel)
     ldply(names(deps), function(pkg)
         if (!identical(deps[[pkg]], NA))
             data.frame(
                 src   = pkg,
                 dep   = deps[[pkg]][, 1],
                 label = depLevel,
                 stringsAsFactors = FALSE))
})

str(edges)


Centrality measures of networks
 compute centrality and density metrics
 identify bridges and simulate changes in the network

nrow(edges) / (nrow(pkgs) * (nrow(pkgs) - 1))
head(sort(table(edges$dep), decreasing = TRUE))
edges <- edges[edges$dep != 'R', ]

library(igraph)
g <- graph.data.frame(edges)
summary(g)
graph.density(g)
head(sort(degree(g), decreasing = TRUE))
head(sort(betweenness(g), decreasing = TRUE))


Visualizing network data

plot(degree(g), betweenness(g), type = 'n', main = 'Centrality of R package dependencies')
text(degree(g), betweenness(g), labels = V(g)$name)

edges <- edges[edges$label != 'Suggests', ]
deptree <- edges$dep[edges$src == 'igraph']
while (!all(edges$dep[edges$src %in% deptree] %in% deptree))
   deptree <- union(deptree, edges$dep[edges$src %in% deptree])
deptree

g <- graph.data.frame(edges[edges$src %in% c('igraph', deptree), ])
plot(g)

V(g)$label.color <- 'orange'
V(g)$label.color[V(g)$name == 'igraph'] <- 'darkred'
V(g)$label.color[V(g)$name %in% edges$dep[edges$src == 'igraph']] <- 'orangered'
E(g)$color <- c('blue', 'green')[factor(E(g)$label)]
plot(g, vertex.shape = 'none', edge.label = NA)
tkplot(g, edge.label = NA)

library(visNetwork)
nodes <- get.data.frame(g, 'vertices')
names(nodes) <- c('id', 'color')
edges <- get.data.frame(g)
visNetwork(nodes, edges)

Custom plot layouts

g <- dominator.tree(g, root = "igraph")$domtree
plot(g, layout = layout.reingold.tilford(g, root = "igraph"), vertex.shape = 'none')

Analyzing R package dependencies with an R package

library(miniCRAN)
pkgs <- pkgAvail()
pkgDep('igraph', availPkgs = pkgs, suggests = FALSE, includeBasePkgs = TRUE)

plot(makeDepGraph('igraph', pkgs, suggests = FALSE, includeBasePkgs = TRUE))

Monday, February 8, 2016

Master R 10 - Classification and Clustering

Cluster analysis

# Hierarchical clustering

d <- dist(mtcars)
h <- hclust(d)
h
plot(h)
rect.hclust(h, k=3, border = "red")
cn <- cutree(h, k=3)
table(cn)
round(aggregate(mtcars, FUN = mean, by = list(cn)), 1)
round(aggregate(mtcars, FUN = sd, by = list(cn)), 1)
round(sapply(mtcars, sd), 1)
round(apply(aggregate(mtcars, FUN = mean, by = list(cn)),2, sd), 1)

# Determining the ideal number of clusters

install.packages('NbClust')
library(NbClust)
NbClust(mtcars, method = 'complete', index = 'dindex')
NbClust(mtcars, method = 'complete', index = 'hartigan')$Best.nc
NbClust(mtcars, method = 'complete', index = 'kl')$Best.nc
NbClust(iris[, -5], method = 'complete', index = 'all')$Best.nc[1,]

# K-means clustering

(k <- kmeans(mtcars, 3))
all(cn == k$cluster)

# Visualizing clusters
library(cluster) 
clusplot(mtcars, k$cluster, color = TRUE, shade = TRUE, labels = 2)


# Latent class models
factors <- c('cyl', 'vs', 'am', 'carb', 'gear')
mtcars[, factors] <- lapply(mtcars[, factors], factor)

# Latent Class Analysis
install.packages('poLCA')
library(poLCA)
p <- poLCA(cbind(cyl, vs, am, carb, gear) ~ 1,data = mtcars, graphs = TRUE, nclass = 3)
p$P

# Latent class regression
# Discriminant analysis
rm(mtcars)
mtcars$gear <- factor(mtcars$gear)
library(MASS)
d <- lda(gear ~ ., data = mtcars, CV =TRUE)
(tab <- table(mtcars$gear, d$class))
sum(diag(tab)) / sum(tab)
round(d$posterior, 4)
d <- lda(gear ~ ., data = mtcars)
plot(d)
plot(d, dimen = 1, type = "both" )


# Logistic regression
lr <- glm(am ~ hp + wt, data = mtcars, family = binomial)
summary(lr)
table(mtcars$am, round(predict(lr, type = 'response')))

install.packages('nnet')
library(nnet) 
(mlr <- multinom(factor(gear) ~ ., data = mtcars)) 
table(mtcars$gear, predict(mlr))
rm(mtcars)


# Machine learning algorithms
# The K-Nearest Neighbors algorithm

#K-Nearest Neighbors is a supervised classification algorithm, which is mostly used in pattern recognition and business analytics. A big advantage of k-NN is that it is not sensitive to outliers, and its usage is extremely straightforward.

set.seed(42)
n     <- nrow(mtcars)
train <- mtcars[sample(n, n/2), ]

library(dplyr)
train <- sample_n(mtcars, n / 2)
test <- mtcars[setdiff(row.names(mtcars), row.names(train)), ]
library(class)
(cm <- knn(
  train = subset(train, select = -gear),
  test  = subset(test, select = -gear),
  cl    = train$gear,
  k     = 5))
cor(test$gear, as.numeric(as.character(cm)))
table(train$gear)

# Classification trees
library(rpart)
ct <- rpart(factor(gear) ~ ., data = train, minsplit = 3)
summary(ct)
plot(ct)
text(ct)
table(test$gear, predict(ct, newdata = test, type = 'class'))

install.packages('party')
library(party)
ct <- ctree(factor(gear) ~ drat, data = train, controls = ctree_control(minsplit = 3)) 
plot(ct, main = "Conditional Inference Tree")
table(test$gear, predict(ct, newdata = test, type = 'node'))

install.packages('randomForest')
library(randomForest)
(rf <- randomForest(factor(gear) ~ ., data = train, ntree = 250))
table(test$gear, predict(rf, test))   
plot(rf)
legend('topright', legend = colnames(rf$err.rate), col    = 1:4, fill   = 1:4, bty    = 'n')

# Other algorithms
install.packages(c('caret','C50'))
library(caret)
library(C50)
C50 <- train(factor(gear) ~ ., data = train, method = 'C5.0')
summary(C50)
table(test$gear, predict(C50, test))

Saturday, February 6, 2016

Master R 9 - From Big to Small Data

Adequacy tests

Normality

library(hflights)
JFK <- hflights[which(hflights$Dest == 'JFK'), c('TaxiIn', 'TaxiOut')]
JFK <- subset(hflights, Dest == 'JFK', select = c(TaxiIn, TaxiOut))

par(mfrow = c(1, 2))
qqnorm(JFK$TaxiIn, ylab = 'TaxiIn')
qqline(JFK$TaxiIn)
qqnorm(JFK$TaxiOut, ylab = 'TaxiOut')
qqline(JFK$TaxiOut)

shapiro.test(JFK$TaxiIn)

Multivariate normality

JFK <- na.omit(JFK)
library(MVN)
mardiaTest(JFK)
hzTest(JFK)
mvt <- roystonTest(JFK)
par(mfrow = c(1, 2))
mvnPlot(mvt, type = "contour", default = TRUE)
mvnPlot(mvt, type = "persp", default = TRUE)

set.seed(42)
mvt <- roystonTest(MASS::mvrnorm(100, mu = c(0, 0), Sigma = matrix(c(10, 3, 3, 2), 2)))
mvnPlot(mvt, type = "contour", default = TRUE)
mvnPlot(mvt, type = "persp", default = TRUE)


Dependence of variables

hflights_numeric <- hflights[, which(sapply(hflights, is.numeric))]
cor(hflights_numeric, use = "pairwise.complete.obs")
str(cor(hflights_numeric, use = "pairwise.complete.obs"))
hflights_numeric <- hflights[,which(sapply(hflights, function(x) is.numeric(x) && var(x, na.rm = TRUE) != 0))]
table(is.na(cor(hflights_numeric, use = "pairwise.complete.obs")))

library(ellipse)
plotcorr(cor(hflights_numeric, use = "pairwise.complete.obs"))
plotcorr(cor(data.frame(
1:10,
1:10 + runif(10),
1:10 + runif(10) * 5,
runif(10),
10:1,
check.names = FALSE)))
cor(hflights$FlightNum, hflights$Month)

KMO and Barlett's test

library(psych)
KMO(cor(hflights_numeric, use = "pairwise.complete.obs"))
cor(hflights_numeric[, c('Cancelled', 'AirTime')])
cancelled <- which(hflights_numeric$Cancelled == 1)
table(hflights_numeric$AirTime[cancelled], exclude = NULL)
table(hflights_numeric$Cancelled)
hflights_numeric <- subset(hflights_numeric, select = -Cancelled)
which(is.na(cor(hflights_numeric, use = "pairwise.complete.obs")), arr.ind = TRUE)
hflights_numeric <- subset(hflights_numeric, select = -Diverted)
KMO(cor(hflights_numeric[, -c(14)], use = "pairwise.complete.obs"))
KMO(mtcars)
cortest.bartlett(cor(mtcars))


Principal Component Analysis

PCA algorithms
prcomp(mtcars, scale = TRUE)
summary(prcomp(mtcars, scale = TRUE))
sum(prcomp(scale(mtcars))$sdev^2)

Determining the number of components

prcomp(scale(mtcars))$sdev^2
VSS.scree(cor(mtcars))
scree(cor(mtcars))
fa.parallel(mtcars)

Interpreting components

pc <- prcomp(mtcars, scale = TRUE)
head(pc$x[, 1:2])
head(scale(mtcars) %*% pc$rotation[, 1:2])
summary(pc$x[, 1:2])
apply(pc$x[, 1:2], 2, sd)
round(cor(pc$x))
pc$rotation[, 1:2]

biplot(pc, cex = c(0.8, 1.2))
abline(h = 0, v = 0, lty = 'dashed')
cor(mtcars, pc$x[, 1:2])

Rotation methods

varimax(pc$rotation[, 1:2])
pcv <- varimax(pc$rotation[, 1:2])$loadings
plot(scale(mtcars) %*% pcv, type = 'n', xlab = 'Transmission', ylab = 'Power')
text(scale(mtcars) %*% pcv, labels = rownames(mtcars))


library(GPArotation)
promax(pc$rotation[, 1:2])

Outlier-detection with PCA

library(jpeg)
t <- tempfile()
download.file('http://bit.ly/nasa-img', t)
img <- readJPEG(t)
str(img)
h <- dim(img)[1]
w <- dim(img)[2]
m <- matrix(img, h*w)
str(m)
pca <- prcomp(m)
summary(pca)
pca$rotation
extractColors <- function(x) rgb(x[1], x[2], x[3])
(colors <- apply(abs(pca$rotation), 2, extractColors))
pie(pca$sdev, col = colors, labels = colors)

par(mfrow = c(1, 2), mar = rep(0, 4))
image(matrix(pca$x[, 1], h), col = gray.colors(100))
image(matrix(pca$x[, 2], h), col = gray.colors(100), yaxt = 'n')


Factor analysis

m <- subset(mtcars, select = c(mpg, cyl, hp, carb))
(f <- fa(m))
fa.diagram(f)
cor(f$scores, mtcars$disp)


Principal Component Analysis versus Factor Analysis

PCA is used to reduce the number of variables by creating principal components that can then be used in further projects instead of the original variables. This means that we try to extract the essence of the dataset by means of artificially created variables that best describe the variance of the data.

FA is the other way around, as it tries to identify unknown, latent variables to explain the original data. In plain English, we use the manifest variables from our empirical dataset to guess the internal structure of the data.


Multidimensional Scaling

as.matrix(eurodist)[1:5, 1:5]
(mds <- cmdscale(eurodist))
plot(mds)
plot(mds, type = 'n')
text(mds[, 1], mds[, 2], labels(eurodist))
mds <- cmdscale(dist(mtcars))
plot(mds, type = 'n')

text(mds[, 1], mds[, 2], rownames(mds))

Thursday, February 4, 2016

Master R 8 - Polishing data

The types and origins of missing data

Missing Completely at Random (MCAR) means that every value in the dataset has the same probability of being missed, 
so no systematic error or distortion is to be expected due to missing data, nor can we explain the pattern of missing values.

Missing at Random (MAR) means that the reason for the missingness is known, or at least can be identified, although it has nothing to do with the actual missing values.

Missing Not at Random (MNAR) means that data is missing for a specific reason that is highly related to the actual question, which classifies the missing values as nonignorable non-response.


Identifying missing data

library(hflights)
table(complete.cases(hflights))
prop.table(table(complete.cases(hflights))) * 100
sort(sapply(hflights, function(x) sum(is.na(x))))

By-passing missing values

mean(cor(apply(hflights, 2, function(x) as.numeric(is.na(x)))), na.rm = TRUE)
Funs <- Filter(is.function, sapply(ls(baseenv()), get, baseenv()))
names(Filter(function(x) any(names(formals(args(x))) %in% 'na.rm'), Funs))
names(Filter(function(x) any(names(formals(args(x))) %in% 'na.rm'), Filter(is.function,sapply(ls('package:stats'), get, 'package:stats'))))

Overriding the default arguments of a function

myMean <- function(...) mean(..., na.rm = TRUE)
mean(c(1:5, NA))
myMean(c(1:5, NA))

library(rapportools)
mean(c(1:5, NA))
detach('package:rapportools')
mean(c(1:5, NA))

library(Defaults)
setDefaults(mean.default, na.rm = TRUE)
mean(c(1:5, NA))
setDefaults(mean, na.rm = TRUE)
mean

Getting rid of missing data

na.omit(c(1:5, NA))
na.exclude(c(1:5, NA))

x <- rnorm(10); y <- rnorm(10)
x[1] <- NA; y[2] <- NA
exclude <- lm(y ~ x, na.action = "na.exclude")
omit <- lm(y ~ x, na.action = "na.omit")
residuals(exclude)

m <- matrix(1:9, 3)
m[which(m %% 4 == 0, arr.ind = TRUE)] <- NA
m
na.omit(m)

Filtering missing data before or during the actual analysis

mean(hflights$ActualElapsedTime)
mean(hflights$ActualElapsedTime, na.rm = TRUE)
mean(na.omit(hflights$ActualElapsedTime))

library(microbenchmark)
NA.RM   <- function() mean(hflights$ActualElapsedTime, na.rm = TRUE)
NA.OMIT <- function() mean(na.omit(hflights$ActualElapsedTime))
microbenchmark(NA.RM(), NA.OMIT())

Data imputation

m[which(is.na(m), arr.ind = TRUE)] <- 0
m

ActualElapsedTime <- hflights$ActualElapsedTime
mean(ActualElapsedTime, na.rm = TRUE)
ActualElapsedTime[which(is.na(ActualElapsedTime))] <- mean(ActualElapsedTime, na.rm = TRUE)
mean(ActualElapsedTime)


library(Hmisc)
mean(impute(hflights$ActualElapsedTime, mean))
sd(hflights$ActualElapsedTime, na.rm = TRUE)
sd(ActualElapsedTime)
summary(iris)

library(missForest)
set.seed(81)
miris <- prodNA(iris, noNA = 0.2)
summary(miris)
iiris <- missForest(miris, xtrue = iris, verbose = TRUE)
str(iiris)

Comparing different imputation methods

miris <- miris[, 1:4]
iris_mean <- impute(miris, fun = mean)
iris_forest <- missForest(miris)
diag(cor(iris[, -5], iris_mean))
diag(cor(iris[, -5], iris_forest$ximp))

Not imputing missing values

Multiple imputation

Extreme values and outliers

detach('package:missForest')
detach('package:randomForest')

library(outliers)
outlier(hflights$DepDelay)
summary(hflights$DepDelay)

library(lattice)
bwplot(hflights$DepDelay)
IQR(hflights$DepDelay, na.rm = TRUE)

Testing extreme values

set.seed(83)
dixon.test(c(runif(10), pi))
model <- lm(hflights$DepDelay ~ 1)
model$coefficients
mean(hflights$DepDelay, na.rm = TRUE)
a <- 0.1
(n <- length(hflights$DepDelay))
(F <- qf(1 - (a/n), 1, n-2, lower.tail = TRUE))
(L <- ((n - 1) * F / (n - 2 + F))^0.5)
sum(abs(rstandard(model)) > L)

Using robust method

summary(lm(Sepal.Length ~ Petal.Length, data = miris))
lm(Sepal.Length ~ Petal.Length, data = iris)$coefficients

library(MASS)
summary(rlm(Sepal.Length ~ Petal.Length, data = miris))
f <- formula(Sepal.Length ~ Petal.Length)
cbind(orig =  lm(f, data = iris)$coefficients, lm   =  lm(f, data = miris)$coefficients, rlm  = rlm(f, data = miris)$coefficients)

miris$Sepal.Length[1] <- 14

cbind(orig = lm(f, data = iris)$coefficients, lm   = lm(f, data = miris)$coefficients, rlm  = rlm(f, data = miris)$coefficients)

Wednesday, February 3, 2016

Master R 7 - Unstructured Data

library(tm)
getSources()
getReaders()

# Importing the corpus
res <- XML::readHTMLTable(paste0('http://cran.r-project.org/', 'web/packages/available_packages_by_name.html'), which = 1)
head(res)
v <- Corpus(VectorSource(res$V2)); v
inspect(head(v, 3))
meta(v[[1]])
writeLines(as.character(v[[1]]))
lapply(v[1:5], as.character)

# Cleaning the corpus
getTransformations()
stopwords("english")
removeWords('to be or not to be', stopwords("english"))
v <- tm_map(v, removeWords, stopwords("english"))
inspect(head(v, 3))

v <- tm_map(v, content_transformer(tolower))
v <- tm_map(v, removePunctuation)
v <- tm_map(v, stripWhitespace)
inspect(head(v, 3))

# Visualizing the most frequent words in the corpus
wordcloud::wordcloud(v)

# Further cleanup
v <- tm_map(v, removeNumbers)
tdm <- TermDocumentMatrix(v)
inspect(tdm[1:5, 1:20])
findFreqTerms(tdm, lowfreq = 100)

myStopwords <- c('package', 'based', 'using')
v <- tm_map(v, removeWords, myStopwords)

# Stemming words
library(SnowballC)
wordStem(c('cats', 'mastering', 'modelling', 'models', 'model'))
wordStem(c('are', 'analyst', 'analyze', 'analysis'))
d <- v
v <- tm_map(v, stemDocument, language = "english")
v <- tm_map(v, content_transformer(function(x, d) {
paste(stemCompletion(strsplit(stemDocument(x), ' ')[[1]],d),collapse = ' ')
}), d)

tdm <- TermDocumentMatrix(v)
findFreqTerms(tdm, lowfreq = 100)


# Lemmatisation
Stemming removes characters from the end of words in the hope of finding the stem, which is a heuristic process often resulting in non-existent words. We tried to overcome this issue by completing the stems to the shortest meaningful words using a dictionary, which might result in a deviation in the meaning of the term.
Another way to reduce the number of inflectional forms of different terms, instead of deconstructing and then trying to rebuild the words, is morphological analysis with the help of a dictionary. This process is called lemmatisation, and it looks for the lemma (the canonical form of a word) instead of stems.

# Analyzing the associations among terms
findAssocs(tdm, 'data', 0.1)

# Some other metrics
vnchar <- sapply(v, function(x) nchar(x$content))
summary(vnchar)
(vm <- which.min(vnchar))
v[[vm]]
hist(vnchar, main = 'Length of R package descriptions', xlab = 'Number of characters')

# The segmentation of documents
hadleyverse <- c('ggplot2', 'dplyr', 'reshape2', 'lubridate','stringr', 'devtools', 'roxygen2', 'tidyr')
(w <- which(res$V1 %in% hadleyverse))
plot(hclust(dist(DocumentTermMatrix(v[w]))), xlab = 'Hadleyverse packages')
sapply(v[w], function(x) structure(content(x), .Names = meta(x, 'id')))

Tuesday, February 2, 2016

Master R 6 - Beyond the linear trend line

The modeling workflow
1 First, fit the model with the main predictors and all the relevant confounders, and then reduce the number of confounders by dropping out the non-significant ones. 
2 Decide whether to use the continuous variables in their original or categorized form.
3 Try to achieve a better fit by testing for non-linear relationships, if they are pragmatically relevant.
4 Finally, check the model assumptions.

Logistic regression
library(catdata)
data(deathpenalty)
library(vcdExtra)
deathpenalty.expand <- expand.dft(deathpenalty)
binom.model.0 <- glm(DeathPenalty ~ DefendantRace, data = deathpenalty.expand, family = binomial)
summary(binom.model.0)
exp(cbind(OR = coef(binom.model.0), confint(binom.model.0)))

binom.model.1 <- update(binom.model.0, . ~ . + VictimRace)
summary(binom.model.1)
exp(cbind(OR = coef(binom.model.1), confint(binom.model.1)))

prop.table(table(factor(deathpenalty.expand$VictimRace,labels = c("VictimRace=0", "VictimRace=1")),factor(deathpenalty.expand$DefendantRace, labels = c("DefendantRace=0", "DefendantRace=1"))), 1)

Data consideration
As a general rule of thumb, logistic regression models require at least 10 events per predictor.

Goodness of model fit
library(lmtest)
lrtest(binom.model.1)
library(BaylorEdPsych)
PseudoR2(binom.model.1)

Model comparison
lrtest(binom.model.0, binom.model.1)

Models for count data
Poisson regression
dfa <- readRDS('SMART_2013.RData')
(ct <- xtabs(~model+failure, data=dfa))
dfa <- dfa[dfa$model %in% names(which(rowSums(ct) - ct[, 1] > 0)),]

library(ggplot2)
ggplot(rbind(dfa, data.frame(model='All', dfa[, -1] )), aes(failure)) + ylab("log(count)") + 
geom_histogram(binwidth = 1, drop=TRUE, origin = -0.5)  + 
scale_y_log10() + scale_x_continuous(breaks=c(0:10)) + 
facet_wrap( ~ model, ncol = 3) + ggtitle("Histograms by manufacturer") + theme_bw()

poiss.base <- glm(failure ~ model, offset(log(freq)),  family = 'poisson', data = dfa)
summary(poiss.base)
contrasts(dfa$model, sparse = TRUE)
lrtest(poiss.base)

Negative binomial regression
library(MASS)
model.negbin.0 <- glm.nb(failure ~ model, offset(log(freq)), data = dfa)
lrtest(poiss.base,model.negbin.0)

Multivariate non-linear models
model.negbin.1 <- update(model.negbin.0, . ~ . + capacity_bytes + age_month + temperature)
model.negbin.2 <- update(model.negbin.1, . ~ . + PendingSector)
lrtest(model.negbin.0, model.negbin.1, model.negbin.2)
summary(model.negbin.2)
exp(data.frame(exp_coef = coef(model.negbin.2)))
dfa$model <- relevel(dfa$model, 'WDC')


model.negbin.3 <- update(model.negbin.2, data = dfa)
library(broom)
format(tidy(model.negbin.3), digits = 4)

library(data.table)
dfa <- data.table(dfa)
dfa[, temp6 := cut2(temperature, g = 6)]
temperature.weighted.mean <- dfa[, .(wfailure = weighted.mean(failure, freq)), by = temp6] 
ggplot(temperature.weighted.mean, aes(x = temp6, y = wfailure)) +  
geom_bar(stat = 'identity') + xlab('Categorized temperature') + ylab('Weighted mean of disk faults') + theme_bw()

model.negbin.4 <- update(model.negbin.0, .~. + capacity_bytes + age_month + temp6 + PendingSector, data = dfa)
AIC(model.negbin.3,model.negbin.4)

weighted.means <- rbind(dfa[, .(l = 'capacity', f = weighted.mean(failure, freq)), by = .(v = capacity_bytes)], dfa[, .(l = 'age', f = weighted.mean(failure, freq)), by = .(v = age_month)])
ggplot(weighted.means, aes(x = l, y = f)) + geom_step() +
facet_grid(. ~ v, scales = 'free_x') + theme_bw() +
ylab('Weighted mean of disk faults') + xlab('')

dfa[, capacity_bytes := as.factor(capacity_bytes)]
dfa[, age8 := cut2(age_month, g = 8)]
model.negbin.5 <- update(model.negbin.0, .~. + capacity_bytes + age8 + temp6 + PendingSector, data = dfa)
AIC(model.negbin.5, model.negbin.4)
format(tidy(model.negbin.5), digits = 3)

tmnb5 <- tidy(model.negbin.5)
str(terms <- tmnb5$term[tmnb5$p.value < 0.05][-1])

library(plyr)
ci <- ldply(terms, function(t) confint(model.negbin.5, t))
names(ci) <- c('min', 'max')
ci$term <- terms
ci$variable <- sub('[A-Z0-9\\]\\[,() ]*$', '', terms, perl = TRUE)

ggplot(ci, aes(x = factor(term), color = variable)) + 
geom_errorbar(ymin = min, ymax = max) + xlab('') +
ylab('Coefficients (95% conf.int)') + theme_bw() + 
theme(axis.text.x = element_text(angle = 90, hjust = 1), legend.position = 'top')
PseudoR2(model.negbin.6 )
