Monday, February 29, 2016

Python Data Analysis 5 - pandas: Reading and Writing Data

I/O API Tools

-- CSV and Textual Files

read_csv
read_table
to_csv

-- Reading Data in CSV or Text Files
import numpy as np
import pandas as pd

csvframe = pd.read_csv('myCSV_01.csv')
csvframe
pd.read_table('ch05_01.csv', sep=',')
pd.read_csv('ch05_02.csv')
pd.read_csv('ch05_02.csv', header=None)
pd.read_csv('ch05_02.csv', names=['white','red','blue','green','animal'])
pd.read_csv('ch05_03.csv', index_col=['color','status'])
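The ch05_*.csv files are the chapter's sample files; as a small self-contained sketch (the file contents below are assumed, not the originals), this is what header=None and names= change when the file has no header row:

# A sketch with an assumed headerless CSV standing in for ch05_02.csv.
import pandas as pd

with open('ch05_02.csv', 'w') as f:
    f.write('1,5,2,3,cat\n2,7,8,5,dog\n3,3,6,7,horse\n2,2,8,3,duck\n')

pd.read_csv('ch05_02.csv')                 # first data row is misread as the header
pd.read_csv('ch05_02.csv', header=None)    # columns get default integer names 0..4
pd.read_csv('ch05_02.csv',                 # columns get the supplied names
            names=['white','red','blue','green','animal'])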

-- Using RegExp for Parsing TXT Files
pd.read_table('ch05_04.txt', sep=r'\s*')
pd.read_table('ch05_05.txt', sep=r'\D*', header=None)
pd.read_table('ch05_06.txt', sep=',', skiprows=[0,1,3,6])
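A regular-expression separator forces pandas onto the slower Python parsing engine. A self-contained sketch (the sample text is assumed):

# A sketch: runs of whitespace as separator, parsed by the Python engine.
import io
import pandas as pd

txt = io.StringIO('white red blue green\n1   5  2   3\n2   7  8   5\n')
pd.read_table(txt, sep=r'\s+', engine='python')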

-- Reading TXT Files into Parts or Partially
pd.read_csv('ch05_02.csv', skiprows=[2], nrows=3, header=None)
out = pd.Series(dtype=float)
i = 0
pieces = pd.read_csv('ch05_01.csv', chunksize=3)
for piece in pieces:
    out[i] = piece['white'].sum()   # item assignment replaces the removed Series.set_value()
    i = i + 1
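An equivalent pattern for the chunked read above (assuming ch05_01.csv has a 'white' column, as in the snippet) collects the partial sums in a list and builds the Series once:

# A sketch: sum the 'white' column of each chunk, then build one Series.
import pandas as pd

partial = [piece['white'].sum()
           for piece in pd.read_csv('ch05_01.csv', chunksize=3)]
out = pd.Series(partial)
out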

-- Writing Data in CSV
frame2
frame2.to_csv('ch05_07.csv')
frame2.to_csv('ch05_07b.csv', index=False, header=False)

frame3
frame3.to_csv('ch05_09.csv', na_rep='NaN')
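frame3 is a DataFrame containing NaN values; a self-contained sketch (its contents and the extra file name are assumed for illustration) showing what na_rep changes in the output:

# A sketch of a frame3-like DataFrame full of missing values.
import numpy as np
import pandas as pd

frame3 = pd.DataFrame(np.nan, index=['blue','green','red'],
                      columns=['ball','mug','pen'])
frame3.to_csv('ch05_09_default.csv')          # missing values become empty fields
frame3.to_csv('ch05_09.csv', na_rep='NaN')    # missing values written as the string 'NaN'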

-- Reading and Writing HTML Files
read_html
to_html

-- Writing Data in HTML
frame = pd.DataFrame(np.arange(4).reshape(2,2))
print(frame.to_html())
frame = pd.DataFrame(np.random.random((4,4)),
                     index = ['white','black','red','blue'],
                     columns = ['up','down','right','left'])
s = ['<HTML>']
s.append('<HEAD><TITLE>My DataFrame</TITLE></HEAD>')
s.append('<BODY>')
s.append(frame.to_html())
s.append('</BODY></HTML>')
html = ''.join(s)
html_file = open('myFrame.html','w')
html_file.write(html)
html_file.close()

-- Reading Data from an HTML File
web_frames = pd.read_html('myFrame.html')
web_frames[0]
ranking = pd.read_html('http://www.meccanismocomplesso.org/en/meccanismo-complesso-sito-2/classifica-punteggio/')
ranking[0]

-- Reading Data from XML
from lxml import objectify
xml = objectify.parse('books.xml')
xml
root = xml.getroot()
root.Book.Author
root.Book.PublishDate
root.getchildren()
[child.tag for child in root.Book.getchildren()]
[child.text for child in root.Book.getchildren()]

def etree2df(root):
    # take the column names from the tags of the first child's elements
    column_names = []
    for i in range(0, len(root.getchildren()[0].getchildren())):
        column_names.append(root.getchildren()[0].getchildren()[i].tag)
    xmlframe = pd.DataFrame(columns=column_names)
    # one row per child of the root, filled with the element texts
    for j in range(0, len(root.getchildren())):
        obj = root.getchildren()[j].getchildren()
        texts = []
        for k in range(0, len(column_names)):
            texts.append(obj[k].text)
        row = dict(zip(column_names, texts))
        row_s = pd.Series(row)
        row_s.name = j
        # DataFrame.append() was removed in pandas 2.0; use pd.concat() there
        xmlframe = xmlframe.append(row_s)
    return xmlframe

etree2df(root)
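For reference, a books.xml with the structure etree2df() expects could look like the file below (element names and sample values are illustrative assumptions, not the chapter's data):

# A sketch of a books.xml whose root has uniform <Book> children.
xml_text = """<Catalog>
   <Book>
      <Author>Ross, Mark</Author>
      <Title>XML Cookbook</Title>
      <Genre>Computer</Genre>
      <Price>23.56</Price>
      <PublishDate>2014-01-22</PublishDate>
   </Book>
   <Book>
      <Author>Bracket, Barbara</Author>
      <Title>XML for Dummies</Title>
      <Genre>Computer</Genre>
      <Price>35.95</Price>
      <PublishDate>2014-12-16</PublishDate>
   </Book>
</Catalog>"""
with open('books.xml', 'w') as f:
    f.write(xml_text)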

-- Reading and Writing Data on Microsoft Excel Files
to_excel()
read_excel()
pd.read_excel('data.xls')
pd.read_excel('data.xls', 'Sheet2')
pd.read_excel('data.xls', 1)
frame = pd.DataFrame(np.random.random((4,4)),
                     index = ['exp1','exp2','exp3','exp4'],
                     columns = ['Jan2015','Feb2015','Mar2015','Apr2015'])
frame
frame.to_excel('data2.xlsx')
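to_excel() and read_excel() rely on an external engine (historically xlrd/xlwt for .xls, openpyxl for .xlsx). A sketch writing two sheets to one workbook and reading one back (sheet names here are assumptions):

# A sketch: one workbook, two sheets, read one sheet back (needs openpyxl).
import numpy as np
import pandas as pd

frame = pd.DataFrame(np.random.random((4,4)),
                     index=['exp1','exp2','exp3','exp4'],
                     columns=['Jan2015','Feb2015','Mar2015','Apr2015'])
with pd.ExcelWriter('data2.xlsx') as writer:
    frame.to_excel(writer, sheet_name='Sheet1')
    (frame * 2).to_excel(writer, sheet_name='Sheet2')
pd.read_excel('data2.xlsx', sheet_name='Sheet2')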

-- JSON Data

read_json
to_json
frame = pd.DataFrame(np.arange(16).reshape(4,4),
                     index=['white','black','red','blue'],
                     columns=['up','down','right','left'])
frame.to_json('frame.json')
pd.read_json('frame.json')
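to_json()/read_json() default to a column-oriented layout; the orient parameter changes that. A short sketch of the 'records' orientation (one JSON object per row):

# A sketch of orient='records'.
import io
import numpy as np
import pandas as pd

frame = pd.DataFrame(np.arange(4).reshape(2,2), columns=['up','down'])
json_text = frame.to_json(orient='records')   # e.g. [{"up":0,"down":1},{"up":2,"down":3}]
pd.read_json(io.StringIO(json_text), orient='records')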

from pandas.io.json import json_normalize   # in newer pandas: pd.json_normalize
import json
file = open('books.json','r')
text = file.read()
text = json.loads(text)
json_normalize(text,'books')
json_normalize(text,'books',['writer','nationality'])
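json_normalize() flattens a nested structure: the second argument names the list to expand into rows, the third lists keys to carry over as metadata columns. A sketch with an assumed books.json-like structure (keys and values are illustrative, not the chapter's file):

# A sketch of the nested layout json_normalize() expects here.
from pandas import json_normalize   # older pandas: from pandas.io.json import json_normalize

text = [
    {"writer": "Mark Ross", "nationality": "USA",
     "books": [{"title": "XML Cookbook", "price": 23.56},
               {"title": "Python Fundamentals", "price": 30.00}]},
    {"writer": "Barbara Bracket", "nationality": "UK",
     "books": [{"title": "Java Enterprise", "price": 28.60}]},
]
json_normalize(text, 'books', ['writer', 'nationality'])   # one row per book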

-- The Format HDF5
from pandas.io.pytables import HDFStore   # or simply pd.HDFStore
frame = pd.DataFrame(np.arange(16).reshape(4,4),
                     index=['white','black','red','blue'],
                     columns=['up','down','right','left'])
store = HDFStore('mydata.h5')
store['obj1'] = frame
frame2
store['obj2'] = frame2

store
store['obj2']
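Reading back from the HDF5 file and releasing it; a short self-contained sketch (requires the PyTables package):

# A sketch: reopen mydata.h5, fetch a stored object by key, then close.
import pandas as pd

store = pd.HDFStore('mydata.h5')
frame_back = store['obj1']   # the same DataFrame stored above
store.close()                # flush buffers and release the file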

-- Pickle: Python Object Serialization

-- Serialize a Python Object with cPickle

import pickle   # Python 2 used "import cPickle as pickle"; Python 3's pickle uses the C implementation automatically
data = {'color': ['white','red'], 'value': [5, 7]}
pickled_data = pickle.dumps(data)
print(pickled_data)
nframe = pickle.loads(pickled_data)
nframe

-- Pickling with pandas

frame = pd.DataFrame(np.arange(16).reshape(4,4), index = ['up','down','left','right'])
frame.to_pickle('frame.pkl')

pd.read_pickle('frame.pkl')

-- Interacting with Databases
from sqlalchemy import create_engine

engine = create_engine('postgresql://scott:tiger@localhost:5432/mydatabase')
engine = create_engine('mysql+mysqldb://scott:tiger@localhost/foo')
engine = create_engine('oracle://scott:tiger@127.0.0.1:1521/sidname')
engine = create_engine('mssql+pyodbc://mydsn')
engine = create_engine('sqlite:///foo.db')

-- Loading and Writing Data with SQLite3
frame = pd.DataFrame(np.arange(20).reshape(4,5), columns=['white','red','blue','black','green'])
frame
engine = create_engine('sqlite:///foo.db')
frame.to_sql('colors', engine)
pd.read_sql('colors', engine)
import sqlite3
query = """
CREATE TABLE test
(a VARCHAR(20), b VARCHAR(20),
c REAL,        d INTEGER
);"""
con = sqlite3.connect(':memory:')
con.execute(query)
con.commit()

data = [('white','up',1,3),
        ('black','down',2,8),
        ('green','up',4,4),
        ('red','down',5,5)]
stmt = "INSERT INTO test VALUES(?,?,?,?)"
con.executemany(stmt, data)
con.commit()

cursor = con.execute('select * from test')
cursor
rows = cursor.fetchall()
rows

cursor.description
pd.DataFrame(rows, columns=[col[0] for col in cursor.description])   # first field of each description tuple is the column name
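A shorter route for the same result: pandas can run the query against the sqlite3 connection directly and pick up the column names itself (a sketch that repeats a minimal setup so it runs on its own):

# A sketch: let read_sql_query build the DataFrame straight from the connection.
import sqlite3
import pandas as pd

con = sqlite3.connect(':memory:')
con.execute('CREATE TABLE test (a VARCHAR(20), b VARCHAR(20), c REAL, d INTEGER)')
con.executemany('INSERT INTO test VALUES(?,?,?,?)',
                [('white','up',1,3), ('black','down',2,8)])
pd.read_sql_query('SELECT * FROM test', con)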

-- Loading and Writing Data with PostgreSQL
pd.__version__
engine = create_engine('postgresql://postgres:password@localhost:5432/postgres')
frame = pd.DataFrame(np.random.random((4,4)),
                     index=['exp1','exp2','exp3','exp4'],
                     columns=['feb','mar','apr','may'])
frame.to_sql('dataframe', engine)
psql -U postgres
pd.read_sql_table('dataframe', engine)
pd.read_sql_query('SELECT index,apr,may FROM DATAFRAME WHERE apr > 0.5', engine)

-- Reading and Writing Data with a NoSQL Database: MongoDB
mongod --dbpath C:\MongoDB_data
import pymongo
client = pymongo.MongoClient('localhost', 27017)
db = client.mydatabase
db
collection = db.mycollection
db['mycollection']
collection
frame = pd.DataFrame(np.arange(20).reshape(4,5),
                     columns=['white','red','blue','black','green'])
frame

import json
record = list(json.loads(frame.T.to_json()).values())
record
collection.mydocument.insert_many(record)   # insert() was removed in newer pymongo

cursor = collection['mydocument'].find()
dataframe = pd.DataFrame(list(cursor))
del dataframe['_id']
dataframe


Conclusions
