My DS Coding Blog: Python Data Analysis 6 - pandas in Depth: Data Manipulation

-- Data Preparation

assembling

- merging pandas.merge()

- concatenating pandas.concat()

- combining pandas.DataFrame.combine_first()

reshaping(pivoting)

removing

-- Merging

import numpy as np

import pandas as pd

# Two small tables that share a single key column, 'id'.
frame1 = pd.DataFrame({
    'id': ['ball', 'pencil', 'pen', 'mug', 'ashtray'],
    'price': [12.33, 11.44, 33.21, 13.23, 33.62],
})
frame1

frame2 = pd.DataFrame({
    'id': ['pencil', 'pencil', 'ball', 'pen'],
    'color': ['white', 'red', 'red', 'black'],
})
frame2

# With no `on=` argument the join key is the set of columns both frames
# have in common -- here only 'id'.
pd.merge(frame1, frame2)

# Both frames now share two column names: 'id' and 'brand'.
frame1 = pd.DataFrame({
    'id': ['ball', 'pencil', 'pen', 'mug', 'ashtray'],
    'color': ['white', 'red', 'red', 'black', 'green'],
    'brand': ['OMG', 'ABC', 'ABC', 'POD', 'POD'],
})
frame1

frame2 = pd.DataFrame({
    'id': ['pencil', 'pencil', 'ball', 'pen'],
    'brand': ['OMG', 'POD', 'ABC', 'POD'],
})
frame2

# Default merge uses every shared column as the key; no (id, brand) pair
# appears in both frames, so the result is empty.
pd.merge(frame1, frame2)

# Pick the key explicitly instead.
pd.merge(frame1, frame2, on='id')
pd.merge(frame1, frame2, on='brand')

# Give the key a different name on the right-hand side.  rename() is used
# rather than assigning a list to `.columns`, because a positional
# assignment relabels by column order and would attach the wrong label to
# the data when the columns are ordered id, brand.
frame2 = frame2.rename(columns={'id': 'sid'})
frame2

pd.merge(frame1, frame2, left_on='id', right_on='sid')

# Restore the original key name.  (The original line lacked the `=` and
# indexed `.columns` instead of assigning to it.)
frame2 = frame2.rename(columns={'sid': 'id'})

pd.merge(frame1, frame2, on='id')                 # inner join (default)
pd.merge(frame1, frame2, on='id', how='outer')
pd.merge(frame1, frame2, on='id', how='left')
pd.merge(frame1, frame2, on='id', how='right')

# Several key columns are passed as a list.
pd.merge(frame1, frame2, on=['id', 'brand'], how='outer')

Merging on Index

# Uses frame1 (id, color, brand) and frame2 (id, brand) from the merging
# examples above.  Rows are matched on their index labels instead of on a
# key column.
pd.merge(frame1, frame2, right_index=True, left_index=True)

# join() also aligns on the index, but it raises ValueError when the two
# frames share column names and no suffix is given.  Catch it so the
# script can carry on to the working call below.
try:
    frame1.join(frame2)
except ValueError as err:
    print(err)

# Rename the right-hand columns by name so nothing overlaps.
frame2 = frame2.rename(columns={'brand': 'brand2', 'id': 'id2'})

frame1.join(frame2)

Concatenating

# --- NumPy arrays -----------------------------------------------------
# `array1` was displayed but never created in the original text; it is
# built here the same way as `array2`, without the offset.
# NOTE(review): confirm this matches the intended example data.
array1 = np.arange(9).reshape((3, 3))
array1

array2 = np.arange(9).reshape((3, 3)) + 6
array2

np.concatenate([array1, array2], axis=1)   # side by side
np.concatenate([array1, array2], axis=0)   # stacked

# --- Series -----------------------------------------------------------
ser1 = pd.Series(np.random.rand(4), index=[1, 2, 3, 4])
ser1

ser2 = pd.Series(np.random.rand(4), index=[5, 6, 7, 8])
ser2

pd.concat([ser1, ser2])            # one longer Series
pd.concat([ser1, ser2], axis=1)    # a DataFrame; unmatched labels become NaN

# `ser3` was used without being defined.
# NOTE(review): assumed to be the axis=1 concatenation shown just above,
# which makes join='inner' keep only the labels 1-4 -- confirm.
ser3 = pd.concat([ser1, ser2], axis=1)
pd.concat([ser1, ser3], axis=1, join='inner')

# `keys` labels each input: an outer index level, or column names on axis=1.
pd.concat([ser1, ser2], keys=[1, 2])
pd.concat([ser1, ser2], axis=1, keys=[1, 2])

# --- DataFrames -------------------------------------------------------
frame1 = pd.DataFrame(np.random.rand(9).reshape(3, 3),
                      index=[1, 2, 3], columns=['A', 'B', 'C'])
frame2 = pd.DataFrame(np.random.rand(9).reshape(3, 3),
                      index=[4, 5, 6], columns=['A', 'B', 'C'])

pd.concat([frame1, frame2])
pd.concat([frame1, frame2], axis=1)

Combining

# Two Series whose index labels only partly overlap.
ser1 = pd.Series(np.random.rand(5), index=list(range(1, 6)))
ser1

ser2 = pd.Series(np.random.rand(4), index=[2, 4, 5, 6])
ser2

# combine_first keeps the caller's values and takes entries from the
# argument only for labels the caller does not have.
ser1.combine_first(ser2)
ser2.combine_first(ser1)

# The same operation restricted to the first three entries of each Series.
ser1.iloc[:3].combine_first(ser2.iloc[:3])

Pivoting

Pivoting with Hierarchical Indexing

frame1 = pd.DataFrame(
    np.arange(9).reshape(3, 3),
    index=['white', 'black', 'red'],
    columns=['ball', 'pen', 'pencil'],
)
frame1

# stack() turns the column labels into an inner index level, giving a
# Series indexed by (row label, column label).  The original displayed
# the result but never bound it to `ser5`, which the next lines use.
ser5 = frame1.stack()
ser5

ser5.unstack()     # inner level back to columns
ser5.unstack(0)    # outer level to columns instead

Pivoting from "Long" to "Wide" Format

# "Long" layout: one row per (color, item) observation.
longframe = pd.DataFrame({
    'color': ['white', 'white', 'white',
              'red', 'red', 'red',
              'black', 'black', 'black'],
    'item': ['ball', 'pen', 'mug',
             'ball', 'pen', 'mug',
             'ball', 'pen', 'mug'],
    'value': np.random.rand(9),
})
longframe

# "Wide" layout: one row per color, one column per item.  The arguments
# are passed by keyword because recent pandas releases no longer accept
# them positionally.
wideframe = longframe.pivot(index='color', columns='item')
wideframe

-- Removing

frame1 = pd.DataFrame(
    np.arange(9).reshape(3, 3),
    index=['white', 'black', 'red'],
    columns=['ball', 'pen', 'pencil'],
)
frame1

# `del` removes a column in place.
del frame1['ball']
frame1

# drop() returns a copy without the given row label; frame1 itself is
# left untouched.
frame1.drop('white')

-- Data Transformation

-- Removing Duplicates

dframe = pd.DataFrame({
    'color': ['white', 'white', 'red', 'red', 'white'],
    'value': [2, 1, 3, 3, 2],
})
dframe

# Boolean Series: True for every row that repeats an earlier row.
dframe.duplicated()

# Use the mask to show only the repeated rows.
dframe[dframe.duplicated()]

-- Mapping

frame = pd.DataFrame({
    'item': ['ball', 'mug', 'pen', 'pencil', 'ashtray'],
    'color': ['white', 'rosso', 'verde', 'black', 'yellow'],
    'price': [5.56, 4.20, 1.30, 0.56, 2.75],
})
frame

# Old value -> new value: translate the two Italian color names.
newcolors = {
    'rosso': 'red',
    'verde': 'green',
}

# replace() returns a new frame; `frame` keeps its original values.
frame.replace(newcolors)

ser = pd.Series([1, 3, np.nan, 4, 6, np.nan, 3])
ser

# Substitute 0 for the missing entries.
ser.replace(np.nan, 0)

Adding Values via Mapping

frame = pd.DataFrame({
    'item': ['ball', 'mug', 'pen', 'pencil', 'ashtray'],
    'color': ['white', 'red', 'green', 'black', 'yellow'],
})
frame

# The lookup table was missing from the original text.  These are the
# item prices listed in the earlier 'Mapping' example.
# NOTE(review): confirm no further entries were intended.
prices = {
    'ball': 5.56,
    'mug': 4.20,
    'pen': 1.30,
    'pencil': 0.56,
    'ashtray': 2.75,
}

# map() looks each item up in the dict and yields the new column.
frame['price'] = frame['item'].map(prices)
frame

Rename the Indexes of the Axes

# Continues with the `frame` built in the previous section
# (columns: item, color, price).
frame

# Old row label -> new row label.
reindex = {
    0: 'first',
    1: 'second',
    2: 'third',
    3: 'fourth',
    4: 'fifth',
}

# A single positional mapping is applied to the row labels.
frame.rename(reindex)

# Old column name -> new column name.
recolumn = {
    'item': 'object',
    'price': 'value',
}

frame.rename(index=reindex, columns=recolumn)

# Mappings may be partial: only the listed labels change.
frame.rename(index={1: 'first'}, columns={'item': 'object'})

# Every call above returned a new frame; inplace=True modifies `frame`.
frame.rename(columns={'item': 'object'}, inplace=True)
frame

-- Discretization and Binning

results = [12, 34, 67, 55, 28, 90, 99, 12, 3, 56, 74, 44, 87, 23, 49, 89, 87]

# Bin edges: (0, 25], (25, 50], (50, 75], (75, 100].
bins = [0, 25, 50, 75, 100]

cat = pd.cut(results, bins)
cat

# The original used `.levels` and `.labels`, which current pandas no
# longer provides: `.categories` holds the intervals and `.codes` the
# bin number of each value.
cat.categories
cat.codes

# Occurrences per bin (the top-level pd.value_counts is deprecated).
pd.Series(cat).value_counts()

bin_names = ['unlikely', 'less likely', 'likely', 'highly likely']
pd.cut(results, bins, labels=bin_names)

# An integer instead of explicit edges asks for that many equal-width bins.
pd.cut(results, 5)

# qcut splits on quantiles, so the bins hold roughly equal counts.
quintiles = pd.qcut(results, 5)
quintiles

pd.Series(quintiles).value_counts()

-- Detecting and Filtering Outliers

# 1000 rows x 3 columns drawn from the standard normal distribution.
randframe = pd.DataFrame(np.random.randn(1000, 3))

randframe.describe()
randframe.std()

# Keep the rows in which at least one column lies more than three
# standard deviations from zero.  `axis` is passed by keyword because
# recent pandas releases reject the positional form `.any(1)`.
randframe[(np.abs(randframe) > (3 * randframe.std())).any(axis=1)]

-- Permutation

# A 5x5 frame holding 0..24, so each row is easy to recognise.
nframe = pd.DataFrame(np.arange(25).reshape((5, 5)))
nframe

# A random ordering of the five row positions.
new_order = np.random.permutation(5)
new_order

# take() returns the rows at the given positions, in that order.
nframe.take(new_order)

# A shorter list selects a subset of the rows.
new_order = [3, 4, 2]
nframe.take(new_order)

Random Sampling

# Three row positions of `nframe` (built in the permutation example),
# drawn with replacement, so a position may repeat.
sample = np.random.randint(low=0, high=len(nframe), size=3)
sample

nframe.take(sample)

String Manipulation

-- Built-in Methods for Manipulation of Strings

text = '16 Bolton Avenue , Boston'

# split() keeps the whitespace around the separator...
text.split(',')

# ...so strip each piece.
tokens = [s.strip() for s in text.split(',')]
tokens

# The pieces can be unpacked straight into names.
address, city = [s.strip() for s in text.split(',')]
address
city

# Concatenation with + works for a couple of strings...
address + ',' + city

# ...but join() is the tool for a whole list.
strings = ['A+', 'A', 'A-', 'B', 'BB', 'BBB', 'C+']
';'.join(strings)

# Substring tests and searches.
'Boston' in text
text.index('Boston')
text.find('Boston')

# index() raises ValueError when the substring is missing, while find()
# returns -1.  The error is caught so the remaining examples still run.
try:
    text.index('New York')
except ValueError as err:
    print(err)

text.find('New York')

text.count('e')
text.count('Avenue')

# replace() substitutes a substring; an empty replacement deletes it.
text.replace('Avenue', 'Street')
text.replace('1', '')

Regular Expressions

import re

pattern matching

substitution

splitting

text = "This is an\t odd \n text!"

# Split on any run of whitespace.  Patterns are raw strings so that
# backslashes reach the regex engine unchanged.
re.split(r'\s+', text)

# The original called `regex.split` without creating `regex`; compiling
# the pattern once gives a reusable object with the same methods.
regex = re.compile(r'\s+')
regex.split(text)

text = 'This is my address: 16 Bolton Avenue, Boston'

# Words starting with a capital A.
re.findall(r'A\w+', text)

# Words starting with A or a.  The original class was written `[A,a]`,
# which also accepts a literal comma as the first character.
re.findall(r'[Aa]\w+', text)

# search() returns only the first match, with its position.
re.search(r'[Aa]\w+', text)

search = re.search(r'[Aa]\w+', text)
search.start()
search.end()
text[search.start():search.end()]

# match() only looks at the very start of the string, so this is None...
re.match(r'[Aa]\w+', text)

# ...while a pattern that fits the first word succeeds.
re.match(r'T\w+', text)

match = re.match(r'T\w+', text)
text[match.start():match.end()]

-- Data Aggregation

-- GroupBy

splitting

applying

combining

A Practical Example

frame = pd.DataFrame({
    'color': ['white', 'red', 'green', 'red', 'green'],
    'object': ['pen', 'pencil', 'pencil', 'ashtray', 'pen'],
    'price1': [5.56, 4.20, 1.30, 0.56, 2.75],
    'price2': [4.75, 4.12, 1.60, 0.75, 3.15],
})
frame

# Group the price1 values by the matching entries of the color column.
# Nothing is computed yet: the result is a GroupBy object.
group = frame['price1'].groupby(frame['color'])
group

-- Hierarchical Grouping

# Uses the `frame` from the practical example above.  Grouping by two
# keys gives one group per (color, object) pair.
ggroup = frame['price1'].groupby([frame['color'], frame['object']])

ggroup.groups
ggroup.sum()

# Several columns can be aggregated at once.
frame[['price1', 'price2']].groupby(frame['color']).mean()

# On the whole frame, numeric_only=True is needed: the text columns
# cannot be averaged and recent pandas raises instead of dropping them.
frame.groupby(frame['color']).mean(numeric_only=True)

Group Iteration

# Iterating a GroupBy yields (key, sub-frame) pairs.  The loop body was
# unindented and used the Python 2 print statement.
for name, group in frame.groupby('color'):
    print(name)
    print(group)

Chain of Transformations

# Uses the `frame` from the practical example above.
# Aggregating one grouped column gives a Series...
result1 = frame['price1'].groupby(frame['color']).mean()
type(result1)

# ...and aggregating the grouped frame gives a DataFrame.
# numeric_only=True skips the text columns, which cannot be averaged.
result2 = frame.groupby(frame['color']).mean(numeric_only=True)
type(result2)

# Three ways to select the price1 means: before grouping, after
# grouping, or after aggregating.
frame['price1'].groupby(frame['color']).mean()
frame.groupby(frame['color'])['price1'].mean()
(frame.groupby(frame['color']).mean(numeric_only=True))['price1']

# Further methods can be chained onto the result, e.g. to label columns.
means = frame.groupby('color').mean(numeric_only=True).add_prefix('mean_')
means

-- Functions on Groups

# Uses the `frame` from the practical example above.
group = frame.groupby('color')

# The 60% quantile of price1 within each color group.
group['price1'].quantile(0.6)

def range(series):
    """Return the spread (maximum minus minimum) of *series*.

    NOTE: this name shadows the builtin ``range``.  It is kept because
    the aggregation calls that follow pass the function by this name.
    """
    return series.max() - series.min()

# agg() applies a user-defined function to every group; `group` and
# `range` are defined just above.
group['price1'].agg(range)

# Built-in aggregations (named by string) and custom functions can be
# mixed in one list, giving one result column per entry.
group['price1'].agg(['mean', 'std', range])

-- Advanced Data Aggregation

frame = pd.DataFrame({
    'color': ['white', 'red', 'green', 'red', 'green'],
    'price1': [5.56, 4.20, 1.30, 0.56, 2.75],
    'price2': [4.75, 4.12, 1.60, 0.75, 3.15],
})
frame

# Per-color totals, with columns labelled tot_price1 / tot_price2.
sums = frame.groupby('color').sum().add_prefix('tot_')
sums

# Attach the totals to every row by matching the color column against
# the index of `sums`.  The original called a bare `merge`, which is not
# in scope when pandas is imported as `pd`.
pd.merge(frame, sums, left_on='color', right_index=True)

# transform() gives the same per-row totals directly, aligned with the
# rows of `frame`.  The aggregation is named by string instead of
# passing np.sum.
frame.groupby('color').transform('sum').add_prefix('tot_')

# `pd.DataFrame` replaces the bare `DataFrame`, which is not in scope
# when pandas is imported as `pd`.
frame = pd.DataFrame({
    'color': ['white', 'black', 'white', 'white', 'black', 'black'],
    'status': ['up', 'up', 'down', 'down', 'down', 'up'],
    'value1': [12.33, 14.55, 22.34, 27.84, 23.40, 18.33],
    'value2': [11.23, 31.80, 29.99, 31.18, 18.25, 22.44],
})
frame

# apply() runs the function once per (color, status) group and stacks
# the results; here it takes each group's column-wise maximum.
frame.groupby(['color', 'status']).apply(lambda x: x.max())

# Relabel rows and columns using the `reindex` and `recolumn` mappings
# defined in the "Rename the Indexes of the Axes" section.
frame.rename(columns=recolumn, index=reindex)

# Ten consecutive hourly timestamps starting on 1 January 2015.  The
# pandas names carry the `pd.` prefix (the original used bare
# `date_range`, `Series` and `DataFrame`), and the hourly alias is
# lower-case because upper-case 'H' is deprecated in recent pandas.
temp = pd.date_range('1/1/2015', periods=10, freq='h')
temp

# A Series indexed by those timestamps.
timeseries = pd.Series(np.random.rand(10), index=temp)
timeseries

# The same timestamps as an ordinary column beside two value columns.
timetable = pd.DataFrame({
    'date': temp,
    'value1': np.random.rand(10),
    'value2': np.random.rand(10),
})
timetable

# Add a categorical label for each row.
timetable['cat'] = ['up', 'down', 'left', 'left', 'up',
                    'up', 'down', 'right', 'right', 'up']
timetable

-- Conclusion

My DS Coding Blog

Tuesday, March 1, 2016

Python Data Analysis 6 - pandas in Depth: Data Manipulation

No comments:

Post a Comment

Blog Archive