200915
In [17]:
import pandas as pd 
import numpy as np
In [18]:
# Read the 1880 file; the column names are supplied here rather than taken from the file.
name_1880 = pd.read_csv(
    'data2/yob1880.txt',
    header=None,
    names=['name', 'gender', 'births'],
)
name_1880
Out[18]:
name gender births
0 Mary F 7065
1 Anna F 2604
2 Emma F 2003
3 Elizabeth F 1939
4 Minnie F 1746
... ... ... ...
1995 Woodie M 5
1996 Worthy M 5
1997 Wright M 5
1998 York M 5
1999 Zachariah M 5

2000 rows × 3 columns

In [19]:
# Total number of births in 1880 for each gender.
name_1880.groupby('gender')['births'].sum()
Out[19]:
gender
F     90993
M    110493
Name: births, dtype: int64
In [20]:
# Load every yearly file (1880 through 2010) and stack them into a single frame,
# tagging each row with the year of the file it came from.
years = range(1880, 2011)
pieces = []
columns = ['name', 'gender', 'births']

for year in years:
    # File names are 'yob<year>.txt' with no trailing dot; a trailing dot only
    # resolves on file systems that silently strip it.
    path = 'data2/yob{}.txt'.format(year)
    frame = pd.read_csv(path, names=columns)
    frame['year'] = year
    pieces.append(frame)

# ignore_index=True replaces the per-file row labels with one continuous range.
names = pd.concat(pieces, ignore_index=True)
In [21]:
# Display the combined frame (columns: name, gender, births, year).
names
Out[21]:
name gender births year
0 Mary F 7065 1880
1 Anna F 2604 1880
2 Emma F 2003 1880
3 Elizabeth F 1939 1880
4 Minnie F 1746 1880
... ... ... ... ...
1690779 Zymaire M 5 2010
1690780 Zyonne M 5 2010
1690781 Zyquarius M 5 2010
1690782 Zyran M 5 2010
1690783 Zzyzx M 5 2010

1690784 rows × 4 columns

In [22]:
# Total births per year, with one column per gender.
# The aggregation is named by string ('sum') instead of passing the builtin
# sum, which newer pandas versions warn about.
total_births = names.pivot_table('births', index='year', columns='gender', aggfunc='sum')
total_births.plot(title='Total births(gender / year)')
Out[22]:
<matplotlib.axes._subplots.AxesSubplot at 0x207c061d7f0>
In [64]:
def add_prop(group):
    """Return a copy of ``group`` with a ``prop`` column added.

    ``prop`` is each row's ``births`` divided by the sum of ``births`` over
    the whole group. The input frame is left untouched, because functions
    passed to ``groupby(...).apply`` should not mutate the group they receive.

    Parameters
    ----------
    group : pandas.DataFrame
        Frame with a numeric ``births`` column.

    Returns
    -------
    pandas.DataFrame
        New frame with the same rows and index plus the ``prop`` column.
    """
    return group.assign(prop=group.births / group.births.sum())
In [65]:
# Apply add_prop separately to every (year, gender) group and show the result.
name = names.groupby(by=['year', 'gender']).apply(func=add_prop)
name
Out[65]:
name gender births year prop
0 Mary F 7065 1880 0.077643
1 Anna F 2604 1880 0.028618
2 Emma F 2003 1880 0.022013
3 Elizabeth F 1939 1880 0.021309
4 Minnie F 1746 1880 0.019188
... ... ... ... ... ...
1690779 Zymaire M 5 2010 0.000003
1690780 Zyonne M 5 2010 0.000003
1690781 Zyquarius M 5 2010 0.000003
1690782 Zyran M 5 2010 0.000003
1690783 Zzyzx M 5 2010 0.000003

1690784 rows × 5 columns

Mary의 prop 0.077643은 1880년에 태어난 여자아이 중 약 7.8%가 Mary라는 이름을 받았다는 뜻이다.

In [66]:
# Add `prop`: each row's share of the births in its (year, gender) group.
# transform('sum') returns the group total aligned to the original rows, so the
# frame keeps its flat row index (no group keys are added to it) and the cell
# gives the same result when it is re-run.
group_totals = names.groupby(['year', 'gender']).births.transform('sum')
names = names.assign(prop=names.births / group_totals)
names
Out[66]:
name gender births year prop
0 Mary F 7065 1880 0.077643
1 Anna F 2604 1880 0.028618
2 Emma F 2003 1880 0.022013
3 Elizabeth F 1939 1880 0.021309
4 Minnie F 1746 1880 0.019188
... ... ... ... ... ...
1690779 Zymaire M 5 2010 0.000003
1690780 Zyonne M 5 2010 0.000003
1690781 Zyquarius M 5 2010 0.000003
1690782 Zyran M 5 2010 0.000003
1690783 Zzyzx M 5 2010 0.000003

1690784 rows × 5 columns

연도별 / 성별에 따른 선호하는 이름 1000개 추출

In [67]:
def get_top1000(group):
    """Return the rows of ``group`` with the highest ``births``, at most 1000.

    Rows are ordered by ``births`` in descending order; a group with fewer
    than 1000 rows is returned in full.
    """
    ranked = group.sort_values(by='births', ascending=False)
    return ranked.head(1000)
In [68]:
# Keep only the highest-births rows (up to 1000) of every (year, gender) group.
grouped = names.groupby(by=['year', 'gender'])
top1000 = grouped.apply(func=get_top1000)
top1000
Out[68]:
name gender births year prop
year gender
1880 F 0 Mary F 7065 1880 0.077643
1 Anna F 2604 1880 0.028618
2 Emma F 2003 1880 0.022013
3 Elizabeth F 1939 1880 0.021309
4 Minnie F 1746 1880 0.019188
... ... ... ... ... ... ... ...
2010 M 1677639 Camilo M 194 2010 0.000102
1677640 Destin M 194 2010 0.000102
1677641 Jaquan M 194 2010 0.000102
1677642 Jaydan M 194 2010 0.000102
1677645 Maxton M 193 2010 0.000102

261877 rows × 5 columns

In [69]:
# Drop the (year, gender, original row) index and renumber the rows from 0.
top1000 = top1000.reset_index(drop=True)
top1000
Out[69]:
name gender births year prop
0 Mary F 7065 1880 0.077643
1 Anna F 2604 1880 0.028618
2 Emma F 2003 1880 0.022013
3 Elizabeth F 1939 1880 0.021309
4 Minnie F 1746 1880 0.019188
... ... ... ... ... ...
261872 Camilo M 194 2010 0.000102
261873 Destin M 194 2010 0.000102
261874 Jaquan M 194 2010 0.000102
261875 Jaydan M 194 2010 0.000102
261876 Maxton M 193 2010 0.000102

261877 rows × 5 columns

상위 1000개의 이름 데이터를 남자(boys)와 여자(girls)로 분리

In [70]:
# Split the top-1000 rows by the value of the gender column.
boys = top1000.loc[top1000['gender'] == 'M']
girls = top1000.loc[top1000['gender'] == 'F']
  • 연도(행)와 이름(열)별 출생 수를 피벗 테이블로 변환
In [71]:
# Births per year (rows) for every name (columns) among the top-1000 rows.
# The aggregation is named by string ('sum') instead of passing the builtin
# sum, which newer pandas versions warn about.
total_births = top1000.pivot_table('births', index='year', columns='name', aggfunc='sum')
total_births.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 131 entries, 1880 to 2010
Columns: 6868 entries, Aaden to Zuri
dtypes: float64(6868)
memory usage: 6.9 MB
In [72]:
# Preview the first rows of the year-by-name births table.
total_births.head()
Out[72]:
name Aaden Aaliyah Aarav Aaron Aarush Ab Abagail Abb Abbey Abbie ... Zoa Zoe Zoey Zoie Zola Zollie Zona Zora Zula Zuri
year
1880 NaN NaN NaN 102.0 NaN NaN NaN NaN NaN 71.0 ... 8.0 23.0 NaN NaN 7.0 NaN 8.0 28.0 27.0 NaN
1881 NaN NaN NaN 94.0 NaN NaN NaN NaN NaN 81.0 ... NaN 22.0 NaN NaN 10.0 NaN 9.0 21.0 27.0 NaN
1882 NaN NaN NaN 85.0 NaN NaN NaN NaN NaN 80.0 ... 8.0 25.0 NaN NaN 9.0 NaN 17.0 32.0 21.0 NaN
1883 NaN NaN NaN 105.0 NaN NaN NaN NaN NaN 79.0 ... NaN 23.0 NaN NaN 10.0 NaN 11.0 35.0 25.0 NaN
1884 NaN NaN NaN 97.0 NaN NaN NaN NaN NaN 98.0 ... 13.0 31.0 NaN NaN 14.0 6.0 8.0 58.0 27.0 NaN

5 rows × 6868 columns

In [73]:
# Yearly births for four selected names, one subplot per name.
subset = total_births.loc[:, ['John', 'Harry', 'Mary', 'Alice']]
subset.plot(
    subplots=True,
    figsize=(12, 10),
    grid=False,
    title='Number of birth per year',
)
Out[73]:
array([<matplotlib.axes._subplots.AxesSubplot object at 0x00000207B9D5A370>,
       <matplotlib.axes._subplots.AxesSubplot object at 0x00000207B3CD0190>,
       <matplotlib.axes._subplots.AxesSubplot object at 0x00000207B40C83D0>,
       <matplotlib.axes._subplots.AxesSubplot object at 0x00000207B40F5550>],
      dtype=object)
In [74]:
import matplotlib.pyplot as plt

# Sum of `prop` over the top-1000 rows, per year (rows) and gender (columns).
# The aggregation is named by string ('sum') instead of passing the builtin sum.
table = top1000.pivot_table('prop', index='year', columns='gender', aggfunc='sum')

# table.plot() draws on a figure of its own, so no plt.figure() is opened
# beforehand; doing so only left an empty extra figure in the output.
table.plot(title='Sum of table1000.prop by year and sex',
           yticks=np.linspace(0, 1.2, 13), xticks=range(1880, 2020, 10))
Out[74]:
<matplotlib.axes._subplots.AxesSubplot at 0x207c293fe20>
<Figure size 432x288 with 0 Axes>
In [75]:
# Top-1000 rows for boys in the year 2010.
df = boys.loc[boys['year'] == 2010]
df
Out[75]:
name gender births year prop
260877 Jacob M 21875 2010 0.011523
260878 Ethan M 17866 2010 0.009411
260879 Michael M 17133 2010 0.009025
260880 Jayden M 17030 2010 0.008971
260881 William M 16870 2010 0.008887
... ... ... ... ... ...
261872 Camilo M 194 2010 0.000102
261873 Destin M 194 2010 0.000102
261874 Jaquan M 194 2010 0.000102
261875 Jaydan M 194 2010 0.000102
261876 Maxton M 193 2010 0.000102

1000 rows × 5 columns

In [76]:
# Running total of `prop`, from the row with the largest share downwards.
prop_cumsum = df.sort_values(by='prop', ascending=False).prop.cumsum()
# searchsorted gives the 0-based position at which the running total reaches
# 0.5; adding 1 turns that position into a count of names.
prop_cumsum.values.searchsorted(0.5) + 1
Out[76]:
117

가정: 흔한 이름 쓰는 걸 좋아하지 않는다.

In [77]:
# Same count as for 2010, this time for boys in 1910.
df = boys.loc[boys['year'] == 1910]
y1910 = df.sort_values(by='prop', ascending=False)['prop'].cumsum()
y1910.values.searchsorted(0.5) + 1
Out[77]:
31
In [78]:
def get_quantile_count(group, q=0.5):
    """Count the rows needed for the running total of ``prop`` to reach ``q``.

    Rows are taken in descending order of ``prop``. The result is the 0-based
    position where ``q`` would be inserted into the cumulative sums, plus one.
    """
    running_share = group.sort_values(by='prop', ascending=False)['prop'].cumsum()
    position = running_share.values.searchsorted(q)
    return position + 1

# Apply get_quantile_count to every (year, gender) group, then move gender
# into the columns so each gender becomes its own line in the plot.
diversity = top1000.groupby(['year', 'gender']).apply(get_quantile_count)
diversity = diversity.unstack('gender')
# diversity.plot() draws on a figure of its own, so no plt.figure() is opened
# beforehand; doing so only left an empty extra figure in the output.
diversity.plot(title="Number of popular names in top 50%")
Out[78]:
<matplotlib.axes._subplots.AxesSubplot at 0x207b4679f40>
<Figure size 432x288 with 0 Axes>

마지막 글자의 변화

In [80]:
# Take the final character of every name and use it as the row key of the pivot.
get_last_letter = lambda x: x[-1]
last_letters = names.name.map(get_last_letter)
last_letters.name = 'last_letter'

# Births per last letter (rows) for every (gender, year) pair (columns).
# The aggregation is named by string ('sum') instead of passing the builtin sum.
table = names.pivot_table('births', index=last_letters, columns=['gender', 'year'], aggfunc='sum')
In [81]:
# Keep only the 1910, 1960 and 2010 columns for each gender.
subtable = table.reindex(
    columns=[1910, 1960, 2010],
    level='year',
)
subtable.head()
Out[81]:
gender F M
year 1910 1960 2010 1910 1960 2010
last_letter
a 108376.0 691247.0 670605.0 977.0 5204.0 28438.0
b NaN 694.0 450.0 411.0 3912.0 38859.0
c 5.0 49.0 946.0 482.0 15476.0 23125.0
d 6750.0 3729.0 2607.0 22111.0 262112.0 44398.0
e 133569.0 435013.0 313833.0 28655.0 178823.0 129012.0
In [83]:
# Divide every (gender, year) column by its own total so that each column
# holds the share of births per last letter.
letter_prop = subtable / subtable.sum()
letter_prop
Out[83]:
gender F M
year 1910 1960 2010 1910 1960 2010
last_letter
a 0.273390 0.341853 0.381240 0.005031 0.002440 0.014980
b NaN 0.000343 0.000256 0.002116 0.001834 0.020470
c 0.000013 0.000024 0.000538 0.002482 0.007257 0.012181
d 0.017028 0.001844 0.001482 0.113858 0.122908 0.023387
e 0.336941 0.215133 0.178415 0.147556 0.083853 0.067959
f NaN 0.000010 0.000055 0.000783 0.004325 0.001188
g 0.000144 0.000157 0.000374 0.002250 0.009488 0.001404
h 0.051529 0.036224 0.075852 0.045562 0.037907 0.051670
i 0.001526 0.039965 0.031734 0.000844 0.000603 0.022628
j NaN NaN 0.000090 NaN NaN 0.000769
k 0.000121 0.000156 0.000356 0.036581 0.049384 0.018541
l 0.043189 0.033867 0.026356 0.065016 0.104904 0.070367
m 0.001201 0.008613 0.002588 0.058044 0.033827 0.024657
n 0.079240 0.130687 0.140210 0.143415 0.152522 0.362771
o 0.001660 0.002439 0.001243 0.017065 0.012829 0.042681
p 0.000018 0.000023 0.000020 0.003172 0.005675 0.001269
q NaN NaN 0.000030 NaN NaN 0.000180
r 0.013390 0.006764 0.018025 0.064481 0.031034 0.087477
s 0.039042 0.012764 0.013332 0.130815 0.102730 0.065145
t 0.027438 0.015201 0.007830 0.072879 0.065655 0.022861
u 0.000684 0.000574 0.000417 0.000124 0.000057 0.001221
v NaN 0.000060 0.000117 0.000113 0.000037 0.001434
w 0.000020 0.000031 0.001182 0.006329 0.007711 0.016148
x 0.000015 0.000037 0.000727 0.003965 0.001851 0.008614
y 0.110972 0.152569 0.116828 0.077349 0.160987 0.058168
z 0.002439 0.000659 0.000704 0.000170 0.000184 0.001831
In [84]:
import matplotlib.pyplot as plt

# Two stacked bar charts of last-letter shares: male on top, female below.
fig, axes = plt.subplots(nrows=2, ncols=1, figsize=(15, 11))
letter_prop['M'].plot(kind='bar', rot=0, ax=axes[0], title='Male')
letter_prop['F'].plot(
    kind='bar',
    rot=0,
    ax=axes[1],
    title='Female',
    legend=False,
)
Out[84]:
<matplotlib.axes._subplots.AxesSubplot at 0x207b4855b80>
In [85]:
# Last-letter shares for every (gender, year) column of the full table.
letter_prop = table.div(table.sum(), axis='columns')
# Rows 'd', 'n', 'y' of the male columns, transposed so that years become rows.
dny_ts = letter_prop.loc[['d', 'n', 'y'], 'M'].transpose()
dny_ts.head()
Out[85]:
last_letter d n y
year
1880 0.083055 0.153213 0.075760
1881 0.083247 0.153214 0.077451
1882 0.085340 0.149560 0.077537
1883 0.084066 0.151646 0.079144
1884 0.086120 0.149915 0.080405
In [86]:
plt.close()
# dny_ts.plot() draws on a figure of its own, so no plt.figure() is opened
# beforehand; doing so only left an empty extra figure in the output.
dny_ts.plot()
Out[86]:
<matplotlib.axes._subplots.AxesSubplot at 0x207c0ec6d90>
<Figure size 432x288 with 0 Axes>

남자 이름에서 여자 이름으로 바뀐 사례: Lesley 또는 Leslie
공통 부분: 'Lesl'

In [89]:
# Distinct names in the top-1000 rows whose lower-cased form contains 'lesl'.
all_names = pd.Series(top1000['name'].unique())
lesley_like = all_names.loc[all_names.str.lower().str.contains('lesl')]
lesley_like
Out[89]:
632     Leslie
2294    Lesley
4262    Leslee
4728     Lesli
6103     Lesly
dtype: object
In [91]:
# Rows whose name is one of the Lesley-like spellings, then total births per spelling.
filtered = top1000.loc[top1000['name'].isin(lesley_like)]
filtered.groupby('name')['births'].sum()
Out[91]:
name
Leslee      1082
Lesley     35022
Lesli        929
Leslie    370429
Lesly      10067
Name: births, dtype: int64
In [92]:
# Births of Lesley-like names per year and gender ...
table = filtered.pivot_table(
    'births',
    index='year',
    columns='gender',
    aggfunc='sum',
)
# ... divided by each year's row total, giving the share per gender within the year.
table = table.div(table.sum(axis='columns'), axis='index')
table.tail()
Out[92]:
gender F M
year
2006 1.0 NaN
2007 1.0 NaN
2008 1.0 NaN
2009 1.0 NaN
2010 1.0 NaN
In [93]:
# table.plot() draws on a figure of its own, so no plt.figure() is opened
# beforehand; doing so only left an empty extra figure in the output.
# The style mapping gives a solid black line to 'M' and a dashed blue line to 'F'.
table.plot(style={'M': 'k-', 'F': 'b--'})
Out[93]:
<matplotlib.axes._subplots.AxesSubplot at 0x207c70116d0>
<Figure size 432x288 with 0 Axes>

+ Recent posts