-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathPython for Data Analysis Chap 2.3.py
187 lines (115 loc) · 4.14 KB
/
Python for Data Analysis Chap 2.3.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
# -*- coding: utf-8 -*-
# <nbformat>3.0</nbformat>
# <codecell>
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# <codecell>
# Load the 1880 records; the three column names are supplied through `names`.
names1880 = pd.read_csv(
    'pydata-book/ch02/names/yob1880.txt',
    names=['name', 'sex', 'births'],
)
names1880
# <codecell>
# Total births in the 1880 file, one sum per value of 'sex'.
names1880.groupby('sex')['births'].sum()
# <codecell>
# Read one file per year (1880 through 2009) and stack them into one frame.
years = range(1880, 2010)
pieces = []
columns = ['name', 'sex', 'births']
for year in years:
    # '%d' is the placeholder that the integer `year` is formatted into
    path = 'pydata-book/ch02/names/yob%d.txt' % year
    frame = pd.read_csv(path, names=columns)
    frame['year'] = year  # tag every row with the year of the file it came from
    pieces.append(frame)
# end years-for
# Concatenate the list of frames into one frame; ignore_index=True replaces
# the per-file row numbers with a single fresh 0..n-1 index.
names = pd.concat(pieces, ignore_index=True)
names
# <codecell>
# Sum 'births' for each (year, sex) pair: one row per year, one column per sex.
# `index=`/`columns=` replace the removed `rows=`/`cols=` keywords, and the
# string 'sum' replaces the builtin, whose use as aggfunc is deprecated.
total_births = names.pivot_table('births', index='year', columns='sex', aggfunc='sum')
total_births.tail()
# <codecell>
# Row index (year) --> x axis, columns (sex) --> one line each,
# entries (birth sums) --> y axis.
total_births.plot(title='Total births by sex and year')
# <codecell>
def add_prop(group):
    """Return a copy of *group* with a ``prop`` column added.

    ``prop`` is each row's ``births`` divided by the sum of ``births`` over
    the whole of *group*.

    Args:
        group: DataFrame with a numeric ``births`` column.

    Returns:
        A new DataFrame with every column of *group* plus ``prop``; *group*
        itself is not modified.
    """
    # Cast to float so the division below is never integer division.
    births = group.births.astype(float)
    # assign() builds a new frame rather than writing into the object that
    # groupby.apply passed in.
    return group.assign(prop=births / births.sum())
# end add_prop-def
# Add 'prop': each row's share of the births within its own (year, sex) group.
# groupby(...).transform('sum') returns the group total aligned to every
# original row, so the division is element-wise and `names` keeps its flat
# row index (no group keys are pushed into the index, which would otherwise
# collide with the 'year'/'sex' columns in later groupby/pivot_table calls).
names['prop'] = (names.births.astype(float)
                 / names.groupby(['year', 'sex']).births.transform('sum'))
names
# <codecell>
# Sanity check: groupby splits the rows into one bucket per (year, sex) pair,
# and the 'prop' values inside every bucket should add up to 1.
np.allclose(names.groupby(['year', 'sex']).prop.sum(), 1)
# <codecell>
def get_top1000(group):
    """Return the rows of *group* with the largest ``births``, at most 1000.

    Args:
        group: DataFrame with a ``births`` column.

    Returns:
        The rows of *group* ordered by descending ``births`` and truncated to
        the first 1000; all rows are returned when *group* has fewer.
    """
    # sort_values replaces the removed sort_index(by=...) spelling.
    return group.sort_values(by='births', ascending=False)[:1000]
# end get_top1000-def
grouped = names.groupby(['year', 'sex'])
# Take the top rows of every (year, sex) group and stack the pieces with a
# fresh 0..n-1 index. Concatenating explicitly (instead of grouped.apply)
# keeps 'year' and 'sex' as ordinary columns only, so the later
# pivot_table/boolean-filter steps do not see them as index levels as well.
top1000 = pd.concat([get_top1000(group) for _, group in grouped], ignore_index=True)
top1000
# <codecell>
boys = top1000[top1000.sex == 'M']
girls = top1000[top1000.sex == 'F']
# Rebinds total_births: now one row per year and one column per name, each
# entry the summed births for that name in that year.
# `index=`/`columns=` replace the removed `rows=`/`cols=` keywords.
total_births = top1000.pivot_table('births', index='year', columns='name', aggfunc='sum')
total_births
# <codecell>
# Plot four names, one subplot per name.
subset = total_births[['John', 'Harry', 'Mary', 'Marilyn']]
subset.plot(subplots=True, figsize=(12, 10), grid=False, title="Number of births per year")
# <codecell>
# Create a new table: row index is year, column index is sex, and each entry
# is the sum of the 'prop' values of the top1000 rows for that (year, sex).
# `index=`/`columns=` replace the removed `rows=`/`cols=` keywords.
table = top1000.pivot_table('prop', index='year', columns='sex', aggfunc='sum')
table.plot(title="Sum of table1000.prop by year and sex",
           yticks=np.linspace(0, 1.2, 13), xticks=range(1880, 2020, 10))
# <codecell>
# Boys' rows for the year 2009 only.
df = boys[boys.year == 2009]
type(df)
# <codecell>
df
# <codecell>
# Order by descending 'prop'; sort_values replaces the removed
# sort_index(by=...) spelling.
df_sorted = df.sort_values(by='prop', ascending=False)
df_sorted
# <codecell>
# Running total of 'prop' down the sorted rows.
prop_cumsum = df_sorted.prop.cumsum()
prop_cumsum[:5]
# <codecell>
# <codecell>
# <codecell>
# A lambda is an anonymous function; bound to a name it works like a one-line def.
get_last_letter = lambda x: x[-1]
last_letters = names['name'].map(get_last_letter)
last_letters.name = 'last_letter'
# Births summed per last letter (rows) for every (sex, year) pair (columns).
# `index=`/`columns=` replace the removed `rows=`/`cols=` keywords.
table = names.pivot_table('births', index=last_letters, columns=['sex', 'year'], aggfunc='sum')
# <codecell>
last_letters.head(5)
# <codecell>
last_letters.tail(5)
# <codecell>
# Keep only the 1910, 1960 and 2009 columns within the 'year' column level.
subtable = table.reindex(columns=[1910, 1960, 2009], level='year')
subtable.head()
# <codecell>
# Divide every column by its own total so each column's entries sum to 1.
letter_prop = subtable / subtable.sum().astype(float)
# <codecell>
# Two stacked bar charts: 'M' columns on the top axes, 'F' columns below.
fig, axes = plt.subplots(2, 1, figsize=(10, 8))
letter_prop['M'].plot(kind='bar', rot=0, ax=axes[0], title='Male')
letter_prop['F'].plot(kind='bar', rot=0, ax=axes[1], title='Female', legend=False)
# Observation: the share of boys' names ending in 'n' has risen sharply in recent years.
# <codecell>
# Same normalization as before, but over every year in `table`.
letter_prop = table / table.sum().astype(float)
# Label-based selection of rows 'd', 'n', 'y' and the 'M' column group;
# .loc replaces the removed .ix indexer.
dny_ts = letter_prop.loc[['d', 'n', 'y'], 'M']
dny_ts.head()
# Note: rows and columns here are transposed relative to the textbook's table,
# which is why the plot below is drawn from dny_ts.T.
# <codecell>
# `style` maps each column label (letter) to its own line style.
dny_ts.T.plot(style={'d': '-.', 'n': '-', 'y': ':'})
# <codecell>
# Distinct names in top1000 that contain 'lesl', compared case-insensitively.
all_names = top1000.name.unique()
mask = np.array(['lesl' in x.lower() for x in all_names])
lesley_like = all_names[mask]
lesley_like
# <codecell>
# Keep only the rows whose name is one of the matches.
filtered = top1000[top1000.name.isin(lesley_like)]
# <codecell>
filtered.groupby('name').births.sum()
# <codecell>
# Births summed per year (rows) and sex (columns) for the matching names.
# `index=`/`columns=` replace the removed `rows=`/`cols=` keywords.
table = filtered.pivot_table('births', index='year', columns='sex', aggfunc='sum')
# <codecell>
# Divide each row by its row total so the entries of a year sum to one.
table = table.div(table.sum(axis=1), axis=0)
table.tail()
# <codecell>
table.head()
# <codecell>
table.plot(style={'M': 'b-', 'F': 'r--'})
# <codecell>