In [17]:
import pandas as pd
import numpy as np
In [18]:
# Column labels are passed explicitly through `names`, so pandas treats the
# first line of the file as data rather than as a header.
name_1880 = pd.read_csv(
    'data2/yob1880.txt',
    names=['name', 'gender', 'births'],
)
name_1880
Out[18]:
In [19]:
# Sum of the 'births' column of the 1880 frame, one total per gender value.
name_1880.groupby('gender')['births'].sum()
Out[19]:
In [20]:
# Read one file per year (1880 through 2010), tag every row with its year,
# and stack the yearly frames into a single DataFrame.
years = range(1880, 2011)
columns = ['name', 'gender', 'births']
pieces = []
for year in years:
    # The file name ends in ".txt" with no trailing dot, matching the path
    # used for the single-year load; a trailing dot makes the open fail on
    # filesystems that do not silently strip it.
    path = 'data2/yob{}.txt'.format(year)
    frame = pd.read_csv(path, names=columns)
    frame['year'] = year
    pieces.append(frame)

# ignore_index=True replaces the per-file row numbers with one continuous index.
names = pd.concat(pieces, ignore_index=True)
In [21]:
# Display the combined frame (pandas truncates the rendering of long frames).
names
Out[21]:
In [22]:
# Births per year, one column per gender.  The aggregation is given as the
# string 'sum': newer pandas releases emit a FutureWarning when the Python
# builtin `sum` is passed as `aggfunc`.
total_births = names.pivot_table('births', index='year', columns='gender', aggfunc='sum')
total_births.plot(title='Total births(gender / year)')
Out[22]:
In [64]:
def add_prop(group):
    """Return a copy of ``group`` with a ``prop`` column added.

    ``prop`` is each row's ``births`` divided by the sum of ``births`` over
    the whole of ``group``, so the column adds up to 1 within the group.

    The frame passed in is left unmodified; functions handed to
    ``groupby(...).apply`` are not supposed to mutate the object they receive.
    """
    return group.assign(prop=group['births'] / group['births'].sum())
In [65]:
# Apply add_prop to every (year, gender) group and keep the result under a
# separate name, leaving `names` itself unassigned in this cell.
name = (
    names
    .groupby(['year', 'gender'])
    .apply(add_prop)
)
name
Out[65]:
Mary의 prop 0.077643은 1880년에 태어난 여자아이 중 약 7.8%가 Mary라는 이름을 받았다는 뜻이다.
In [66]:
# Share of each row's births within its (year, gender) group.
# transform('sum') broadcasts every group's total back onto its rows, so the
# frame keeps its original flat index and its 'year'/'gender' columns
# regardless of how the installed pandas handles group keys in apply(), and
# re-running the cell recomputes the same values.
names = names.assign(
    prop=names['births'] / names.groupby(['year', 'gender'])['births'].transform('sum')
)
names
Out[66]:
연도별 / 성별에 따른 선호하는 이름 1000개 추출
In [67]:
def get_top1000(group):
    """Return the first 1000 rows of ``group`` after sorting by ``births``, largest first.

    Groups with fewer than 1000 rows come back whole (sorted).
    """
    by_births_desc = group.sort_values(by='births', ascending=False)
    return by_births_desc[:1000]
In [68]:
# Group by year and gender, then keep each group's top rows via get_top1000.
grouped = names.groupby(by=['year', 'gender'])
top1000 = grouped.apply(get_top1000)
top1000
Out[68]:
In [69]:
# Drop the index produced by the grouped apply and renumber rows from 0.
# Rebinding the name (instead of inplace=True) avoids mutating an object
# that an earlier cell has already displayed.
top1000 = top1000.reset_index(drop=True)
top1000
Out[69]:
상위 1000개의 이름 데이터를 남자(boys)와 여자(girls)로 분리
In [70]:
# Split the top-1000 rows on the value of the 'gender' column.
boys = top1000.loc[top1000['gender'] == 'M']
girls = top1000.loc[top1000['gender'] == 'F']
- 연도와 출생수를 피봇테이블로 변환
In [71]:
# Births per year with one column per name (rows: year, columns: name).
# NOTE: this rebinds `total_births`, which earlier held the per-gender totals.
# The aggregation is given as the string 'sum' because newer pandas releases
# warn when the Python builtin `sum` is passed as `aggfunc`.
total_births = top1000.pivot_table('births', index='year', columns='name', aggfunc='sum')
total_births.info()
In [72]:
# Preview the first rows of the year-by-name pivot table.
total_births.head()
Out[72]:
In [73]:
# Pick four name columns and draw each one in its own subplot.
subset = total_births.loc[:, ['John', 'Harry', 'Mary', 'Alice']]
subset.plot(
    subplots=True,
    figsize=(12, 10),
    grid=False,
    title='Number of birth per year',
)
Out[73]:
In [74]:
import matplotlib.pyplot as plt

# Sum of `prop` over the top-1000 rows for every year and gender.  The
# aggregation is named as the string 'sum' because newer pandas releases warn
# when the Python builtin `sum` is passed as `aggfunc`.
table = top1000.pivot_table('prop', index='year', columns='gender', aggfunc='sum')

# DataFrame.plot opens a figure of its own unless it is handed an Axes, so a
# bare plt.figure() would only leave an empty figure behind.  Create the Axes
# explicitly and draw on it.
fig, ax = plt.subplots()
table.plot(ax=ax, title='Sum of table1000.prop by year and sex',
           yticks=np.linspace(0, 1.2, 13), xticks=range(1880, 2020, 10))
Out[74]:
In [75]:
# Rows of `boys` whose 'year' column equals 2010.
df = boys.loc[boys['year'] == 2010]
df
Out[75]:
In [76]:
# Running total of `prop` over the rows of `df`, largest share first.
prop_cumsum = (
    df
    .sort_values(by='prop', ascending=False)
    ['prop']
    .cumsum()
)
prop_cumsum[:10]  # not the last expression of the cell, so it is not displayed
# searchsorted gives a zero-based insertion position; +1 turns it into a count.
prop_cumsum.to_numpy().searchsorted(0.5) + 1
Out[76]:
가정: 흔한 이름 쓰는 걸 좋아하지 않는다.
In [77]:
# Same calculation for the rows of `boys` with year 1910 (rebinds `df`).
df = boys.loc[boys['year'] == 1910]
y1910 = (
    df
    .sort_values(by='prop', ascending=False)
    ['prop']
    .cumsum()
)
# Zero-based insertion position of 0.5 in the running total, plus one.
y1910.to_numpy().searchsorted(0.5) + 1
Out[77]:
In [78]:
def get_quantile_count(group, q=0.5):
    """Count how many of the largest ``prop`` values are needed to reach ``q``.

    The ``prop`` values of ``group`` are accumulated from largest to smallest;
    the result is the zero-based position at which ``q`` would be inserted
    into that running total, plus one.
    """
    running_total = group['prop'].sort_values(ascending=False).cumsum()
    return running_total.values.searchsorted(q) + 1
# One get_quantile_count value per (year, gender) group, then move 'gender'
# from the index into the columns so each gender becomes its own line.
diversity = top1000.groupby(['year', 'gender']).apply(get_quantile_count)
diversity = diversity.unstack('gender')

# DataFrame.plot opens its own figure unless given an Axes, so a bare
# plt.figure() would leave an empty figure behind; draw on an explicit Axes.
fig, ax = plt.subplots()
diversity.plot(ax=ax, title="Number of popular names in top 50%")
Out[78]:
마지막 글자의 변화
In [80]:
# Last character of every name, as a Series labelled 'last_letter' that is
# aligned row-for-row with `names`.
get_last_letter = lambda x: x[-1]
last_letters = names['name'].map(get_last_letter).rename('last_letter')

# Births summed per last letter (rows) and per gender/year (columns).  The
# aggregation is named as the string 'sum' because newer pandas releases warn
# when the Python builtin `sum` is passed as `aggfunc`.
table = names.pivot_table('births', index=last_letters,
                          columns=['gender', 'year'], aggfunc='sum')
In [81]:
# Keep only three years on the 'year' level of the column index.
subtable = table.reindex(
    columns=[1910, 1960, 2010],
    level='year',
)
subtable.head()
Out[81]:
In [83]:
subtable.sum()  # column totals; not the last expression, so not displayed
# Divide every column by its own total so each column sums to 1.
letter_prop = subtable.div(subtable.sum(), axis='columns')
letter_prop
Out[83]:
In [84]:
import matplotlib.pyplot as plt

# Two stacked panels: the 'M' columns on top, the 'F' columns below.
fig, axes = plt.subplots(nrows=2, ncols=1, figsize=(15, 11))
letter_prop['M'].plot.bar(rot=0, ax=axes[0], title='Male')
# The legend is drawn only once, on the upper panel.
letter_prop['F'].plot.bar(rot=0, ax=axes[1], title='Female', legend=False)
Out[84]:
In [85]:
# Normalise the full table column-wise (rebinds `letter_prop`), then take the
# rows for the letters d, n and y under 'M' and transpose so rows are years.
letter_prop = table.div(table.sum(), axis='columns')
dny_ts = letter_prop.loc[['d', 'n', 'y'], 'M'].transpose()
dny_ts.head()
Out[85]:
In [86]:
# Close the current figure before drawing a new one.
plt.close()
# DataFrame.plot opens its own figure unless given an Axes, so a bare
# plt.figure() would leave an empty figure behind; draw on an explicit Axes.
fig, ax = plt.subplots()
dny_ts.plot(ax=ax)
Out[86]:
남자 이름 → 여자 이름으로 바뀐 사례
예: Lesley 또는 Leslie
두 철자의 공통 부분: 'lesl' (대소문자 구분 없이 검색)
In [89]:
# Distinct names in the top-1000 data, then those whose lower-cased form
# contains the substring 'lesl'.
all_names = pd.Series(top1000['name'].unique())
lesley_like = all_names.loc[all_names.str.lower().str.contains('lesl')]
lesley_like
Out[89]:
In [91]:
# Keep only the rows whose name is one of the 'lesl' matches, then total the
# births for each spelling.
filtered = top1000.loc[top1000['name'].isin(lesley_like)]
filtered.groupby('name')['births'].sum()
Out[91]:
In [92]:
# Births of the matched names per year and gender (rebinds `table`).
table = filtered.pivot_table(
    values='births',
    index='year',
    columns='gender',
    aggfunc='sum',
)
# Divide each row by its own total so the gender columns sum to 1 per year.
table = table.div(table.sum(axis=1), axis=0)
table.tail()
Out[92]:
In [93]:
# DataFrame.plot opens its own figure unless given an Axes, so a bare
# plt.figure() would leave an empty figure behind; draw on an explicit Axes.
fig, ax = plt.subplots()
# Per-column line styles: 'M' as a solid black line, 'F' as a dashed blue line.
table.plot(ax=ax, style={'M': 'k-', 'F': 'b--'})
Out[93]:
'Python' 카테고리의 다른 글
마크다운 요약 (0) | 2020.09.29 |
---|---|
Python_우리나라 아기이름 데이터 분석 (1) | 2020.09.15 |
Python_판다스_데이터분석 (0) | 2020.09.14 |
Python_example (0) | 2020.09.11 |
python_pandas(판다스): 계층적 색인 지정, 누락된 데이터처리, 결측치채우기, 데이터 변형하기, onehot인코딩 (0) | 2020.09.11 |