Monday, February 29, 2016

Python Data Analysis 5 - pandas: Reading and Writing Data

I/O API Tools

-- CSV and Textual Files

read_csv
read_table
to_csv

-- Reading Data in CSV or Text Files
import numpy as np
import pandas as pd

csvframe = pd.read_csv('myCSV_01.csv')
csvframe
pd.read_table('ch05_01.csv', sep=',')
pd.read_csv('ch05_02.csv')
pd.read_csv('ch05_02.csv', header=None)
pd.read_csv('ch05_02.csv', names=['white','red','blue','green','animal'])
pd.read_csv('ch05_03.csv', index_col=['color','status'])
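The ch05_*.csv files are the chapter's sample files; as a small self-contained sketch (the file contents below are assumed, not the originals), this is what header=None and names= change when the file has no header row:

# A sketch with an assumed headerless CSV standing in for ch05_02.csv.
import pandas as pd

with open('ch05_02.csv', 'w') as f:
    f.write('1,5,2,3,cat\n2,7,8,5,dog\n3,3,6,7,horse\n2,2,8,3,duck\n')

pd.read_csv('ch05_02.csv')                 # first data row is misread as the header
pd.read_csv('ch05_02.csv', header=None)    # columns get default integer names 0..4
pd.read_csv('ch05_02.csv',                 # columns get the supplied names
            names=['white','red','blue','green','animal'])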

-- Using RegExp for Parsing TXT Files
pd.read_table('ch05_04.txt', sep=r'\s*')
pd.read_table('ch05_05.txt', sep=r'\D*', header=None)
pd.read_table('ch05_06.txt', sep=',', skiprows=[0,1,3,6])
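A regular-expression separator forces pandas onto the slower Python parsing engine. A self-contained sketch (the sample text is assumed):

# A sketch: runs of whitespace as separator, parsed by the Python engine.
import io
import pandas as pd

txt = io.StringIO('white red blue green\n1   5  2   3\n2   7  8   5\n')
pd.read_table(txt, sep=r'\s+', engine='python')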

-- Reading TXT Files into Parts or Partially
pd.read_csv('ch05_02.csv', skiprows=[2], nrows=3, header=None)
out = pd.Series(dtype=float)
i = 0
pieces = pd.read_csv('ch05_01.csv', chunksize=3)
for piece in pieces:
    out[i] = piece['white'].sum()   # item assignment replaces the removed Series.set_value()
    i = i + 1
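An equivalent pattern for the chunked read above (assuming ch05_01.csv has a 'white' column, as in the snippet) collects the partial sums in a list and builds the Series once:

# A sketch: sum the 'white' column of each chunk, then build one Series.
import pandas as pd

partial = [piece['white'].sum()
           for piece in pd.read_csv('ch05_01.csv', chunksize=3)]
out = pd.Series(partial)
out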

-- Writing Data in CSV
frame2
frame2.to_csv('ch05_07.csv')
frame2.to_csv('ch05_07b.csv', index=False, header=False)

frame3
frame3.to_csv('ch05_09.csv', na_rep='NaN')
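frame3 is a DataFrame containing NaN values; a self-contained sketch (its contents and the extra file name are assumed for illustration) showing what na_rep changes in the output:

# A sketch of a frame3-like DataFrame full of missing values.
import numpy as np
import pandas as pd

frame3 = pd.DataFrame(np.nan, index=['blue','green','red'],
                      columns=['ball','mug','pen'])
frame3.to_csv('ch05_09_default.csv')          # missing values become empty fields
frame3.to_csv('ch05_09.csv', na_rep='NaN')    # missing values written as the string 'NaN'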

-- Reading and Writing HTML Files
read_html
to_html

-- Writing Data in HTML
frame = pd.DataFrame(np.arange(4).reshape(2,2))
print(frame.to_html())
frame = pd.DataFrame(np.random.random((4,4)),
                     index = ['white','black','red','blue'],
                     columns = ['up','down','right','left'])
s = ['<HTML>']
s.append('<HEAD><TITLE>My DataFrame</TITLE></HEAD>')
s.append('<BODY>')
s.append(frame.to_html())
s.append('</BODY></HTML>')
html = ''.join(s)
html_file = open('myFrame.html','w')
html_file.write(html)
html_file.close()

-- Reading Data from an HTML File
web_frames = pd.read_html('myFrame.html')
web_frames[0]
ranking = pd.read_html('http://www.meccanismocomplesso.org/en/meccanismo-complesso-sito-2/classifica-punteggio/')
ranking[0]

-- Reading Data from XML
from lxml import objectify
xml = objectify.parse('books.xml')
xml
root = xml.getroot()
root.Book.Author
root.Book.PublishDate
root.getchildren()
[child.tag for child in root.Book.getchildren()]
[child.text for child in root.Book.getchildren()]

def etree2df(root):
    # take the column names from the tags of the first child's elements
    column_names = []
    for i in range(0, len(root.getchildren()[0].getchildren())):
        column_names.append(root.getchildren()[0].getchildren()[i].tag)
    xmlframe = pd.DataFrame(columns=column_names)
    # one row per child of the root, filled with the element texts
    for j in range(0, len(root.getchildren())):
        obj = root.getchildren()[j].getchildren()
        texts = []
        for k in range(0, len(column_names)):
            texts.append(obj[k].text)
        row = dict(zip(column_names, texts))
        row_s = pd.Series(row)
        row_s.name = j
        # DataFrame.append() was removed in pandas 2.0; use pd.concat() there
        xmlframe = xmlframe.append(row_s)
    return xmlframe

etree2df(root)
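For reference, a books.xml with the structure etree2df() expects could look like the file below (element names and sample values are illustrative assumptions, not the chapter's data):

# A sketch of a books.xml whose root has uniform <Book> children.
xml_text = """<Catalog>
   <Book>
      <Author>Ross, Mark</Author>
      <Title>XML Cookbook</Title>
      <Genre>Computer</Genre>
      <Price>23.56</Price>
      <PublishDate>2014-01-22</PublishDate>
   </Book>
   <Book>
      <Author>Bracket, Barbara</Author>
      <Title>XML for Dummies</Title>
      <Genre>Computer</Genre>
      <Price>35.95</Price>
      <PublishDate>2014-12-16</PublishDate>
   </Book>
</Catalog>"""
with open('books.xml', 'w') as f:
    f.write(xml_text)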

-- Reading and Writing Data on Microsoft Excel Files
to_excel()
read_excel()
pd.read_excel('data.xls')
pd.read_excel('data.xls', 'Sheet2')
pd.read_excel('data.xls', 1)
frame = pd.DataFrame(np.random.random((4,4)),
                     index = ['exp1','exp2','exp3','exp4'],
                     columns = ['Jan2015','Feb2015','Mar2015','Apr2015'])
frame
frame.to_excel('data2.xlsx')
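to_excel() and read_excel() rely on an external engine (historically xlrd/xlwt for .xls, openpyxl for .xlsx). A sketch writing two sheets to one workbook and reading one back (sheet names here are assumptions):

# A sketch: one workbook, two sheets, read one sheet back (needs openpyxl).
import numpy as np
import pandas as pd

frame = pd.DataFrame(np.random.random((4,4)),
                     index=['exp1','exp2','exp3','exp4'],
                     columns=['Jan2015','Feb2015','Mar2015','Apr2015'])
with pd.ExcelWriter('data2.xlsx') as writer:
    frame.to_excel(writer, sheet_name='Sheet1')
    (frame * 2).to_excel(writer, sheet_name='Sheet2')
pd.read_excel('data2.xlsx', sheet_name='Sheet2')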

-- JSON Data

read_json
to_json
frame = pd.DataFrame(np.arange(16).reshape(4,4),
                     index=['white','black','red','blue'],
                     columns=['up','down','right','left'])
frame.to_json('frame.json')
pd.read_json('frame.json')
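to_json()/read_json() default to a column-oriented layout; the orient parameter changes that. A short sketch of the 'records' orientation (one JSON object per row):

# A sketch of orient='records'.
import io
import numpy as np
import pandas as pd

frame = pd.DataFrame(np.arange(4).reshape(2,2), columns=['up','down'])
json_text = frame.to_json(orient='records')   # e.g. [{"up":0,"down":1},{"up":2,"down":3}]
pd.read_json(io.StringIO(json_text), orient='records')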

from pandas.io.json import json_normalize   # in newer pandas: pd.json_normalize
import json
file = open('books.json','r')
text = file.read()
text = json.loads(text)
json_normalize(text,'books')
json_normalize(text,'books',['writer','nationality'])
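json_normalize() flattens a nested structure: the second argument names the list to expand into rows, the third lists keys to carry over as metadata columns. A sketch with an assumed books.json-like structure (keys and values are illustrative, not the chapter's file):

# A sketch of the nested layout json_normalize() expects here.
from pandas import json_normalize   # older pandas: from pandas.io.json import json_normalize

text = [
    {"writer": "Mark Ross", "nationality": "USA",
     "books": [{"title": "XML Cookbook", "price": 23.56},
               {"title": "Python Fundamentals", "price": 30.00}]},
    {"writer": "Barbara Bracket", "nationality": "UK",
     "books": [{"title": "Java Enterprise", "price": 28.60}]},
]
json_normalize(text, 'books', ['writer', 'nationality'])   # one row per book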

-- The Format HDF5
from pandas.io.pytables import HDFStore   # or simply pd.HDFStore
frame = pd.DataFrame(np.arange(16).reshape(4,4),
                     index=['white','black','red','blue'],
                     columns=['up','down','right','left'])
store = HDFStore('mydata.h5')
store['obj1'] = frame
frame2
store['obj2'] = frame2

store
store['obj2']
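Reading back from the HDF5 file and releasing it; a short self-contained sketch (requires the PyTables package):

# A sketch: reopen mydata.h5, fetch a stored object by key, then close.
import pandas as pd

store = pd.HDFStore('mydata.h5')
frame_back = store['obj1']   # the same DataFrame stored above
store.close()                # flush buffers and release the file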

-- Pickle: Python Object Serialization

-- Serialize a Python Object with cPickle

import pickle   # Python 2 used "import cPickle as pickle"; Python 3's pickle uses the C implementation automatically
data = {'color': ['white','red'], 'value': [5, 7]}
pickled_data = pickle.dumps(data)
print(pickled_data)
nframe = pickle.loads(pickled_data)
nframe

-- Pickling with pandas

frame = pd.DataFrame(np.arange(16).reshape(4,4), index = ['up','down','left','right'])
frame.to_pickle('frame.pkl')

pd.read_pickle('frame.pkl')

-- Interacting with Databases
from sqlalchemy import create_engine

engine = create_engine('postgresql://scott:tiger@localhost:5432/mydatabase')
engine = create_engine('mysql+mysqldb://scott:tiger@localhost/foo')
engine = create_engine('oracle://scott:tiger@127.0.0.1:1521/sidname')
engine = create_engine('mssql+pyodbc://mydsn')
engine = create_engine('sqlite:///foo.db')

-- Loading and Writing Data with SQLite3
frame = pd.DataFrame(np.arange(20).reshape(4,5), columns=['white','red','blue','black','green'])
frame
engine = create_engine('sqlite:///foo.db')
frame.to_sql('colors', engine)
pd.read_sql('colors', engine)
import sqlite3
query = """
CREATE TABLE test
(a VARCHAR(20), b VARCHAR(20),
c REAL,        d INTEGER
);"""
con = sqlite3.connect(':memory:')
con.execute(query)
con.commit()

data = [('white','up',1,3),
        ('black','down',2,8),
        ('green','up',4,4),
        ('red','down',5,5)]
stmt = "INSERT INTO test VALUES(?,?,?,?)"
con.executemany(stmt, data)
con.commit()

cursor = con.execute('select * from test')
cursor
rows = cursor.fetchall()
rows

cursor.description
pd.DataFrame(rows, columns=[col[0] for col in cursor.description])   # first field of each description tuple is the column name
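A shorter route for the same result: pandas can run the query against the sqlite3 connection directly and pick up the column names itself (a sketch that repeats a minimal setup so it runs on its own):

# A sketch: let read_sql_query build the DataFrame straight from the connection.
import sqlite3
import pandas as pd

con = sqlite3.connect(':memory:')
con.execute('CREATE TABLE test (a VARCHAR(20), b VARCHAR(20), c REAL, d INTEGER)')
con.executemany('INSERT INTO test VALUES(?,?,?,?)',
                [('white','up',1,3), ('black','down',2,8)])
pd.read_sql_query('SELECT * FROM test', con)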

-- Loading and Writing Data with PostgreSQL
pd.__version__
engine = create_engine('postgresql://postgres:password@localhost:5432/postgres')
frame = pd.DataFrame(np.random.random((4,4)),
                     index=['exp1','exp2','exp3','exp4'],
                     columns=['feb','mar','apr','may'])
frame.to_sql('dataframe', engine)
psql -U postgres
pd.read_sql_table('dataframe', engine)
pd.read_sql_query('SELECT index,apr,may FROM DATAFRAME WHERE apr > 0.5', engine)

-- Reading and Writing Data with a NoSQL Database: MongoDB
mongod --dbpath C:\MongoDB_data
import pymongo
client = pymongo.MongoClient('localhost', 27017)
db = client.mydatabase
db
collection = db.mycollection
db['mycollection']
collection
frame = pd.DataFrame(np.arange(20).reshape(4,5),
                     columns=['white','red','blue','black','green'])
frame

import json
record = list(json.loads(frame.T.to_json()).values())
record
collection.mydocument.insert_many(record)   # insert() was removed in newer pymongo

cursor = collection['mydocument'].find()
dataframe = pd.DataFrame(list(cursor))
del dataframe['_id']
dataframe


Conclusions
