In [118]:
import matplotlib.font_manager as fm
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
# Point matplotlib at a specific TrueType font so that text in figures is
# drawn with it (presumably needed for the Hangul labels used below).
# NOTE(review): absolute Windows-only path; this cell fails on other systems.
font_path = 'C:/Windows/Fonts/NGULIM.TTF'
# Build the FontProperties object once: it is kept as a reusable handle
# (size 15) and also supplies the family name for the global rc setting.
fontprop = fm.FontProperties(fname=font_path, size=15)
font_family = fontprop.get_name()
# Requires `matplotlib.pyplot` to be imported as `plt` in the import cell.
plt.rcParams["font.family"] = font_family
In [119]:
# Load one CSV per year (2008 through 2020) and stack them into a single frame.
years = range(2008, 2021)
columns = ['name', 'gender', 'births']
pieces = []  # per-year frames, concatenated after the loop
for year in years:
    path = 'korea/y{}.txt'.format(year)
    # The files carry no header row, so column names are supplied here;
    # each frame is tagged with the year it was read for.
    frame = pd.read_csv(path, names=columns).assign(year=year)
    pieces.append(frame)
# ignore_index=True discards the per-file row numbers in favour of 0..n-1.
names = pd.concat(pieces, ignore_index=True)
In [120]:
# Display the combined frame built from all yearly files.
names
Out[120]:
In [150]:
# Total births per year, one column per gender.
# The aggregation is named by string: passing the builtin `sum` is deprecated
# in recent pandas and emits a FutureWarning.
total_births = names.pivot_table('births', index='year', columns='gender', aggfunc='sum')
total_births
Out[150]:
In [121]:
# Total births per year, one column per gender, drawn as one line per gender.
# The aggregation is named by string: passing the builtin `sum` is deprecated
# in recent pandas and emits a FutureWarning.
total_births = names.pivot_table('births', index='year', columns='gender', aggfunc='sum')
total_births.plot(title='total births(gender/year)')
Out[121]:
In [122]:
def add_prop(group):
    """Return a copy of `group` with a `prop` column added.

    `prop` is each row's `births` divided by the sum of `births` over the
    whole of `group`, so the column sums to 1 within the group.

    The input frame is left untouched: pandas does not support functions
    that mutate the object handed to them by `groupby(...).apply`.
    """
    return group.assign(prop=group.births / group.births.sum())
In [123]:
# Share of each name within its (year, gender) group.
# `transform('sum')` broadcasts the group total back onto every row, so the
# frame keeps its flat index and its 'year'/'gender' columns regardless of
# pandas version, and re-running this cell simply recomputes `prop`.
names = names.assign(
    prop=names['births'] / names.groupby(['year', 'gender'])['births'].transform('sum')
)
names
Out[123]:
In [124]:
# Sanity check: `prop` summed within each (year, gender) group.
names.groupby(['year', 'gender'])['prop'].sum()
Out[124]:
연도별/성별에 따른 선호하는 이름 100개 추출
In [161]:
# Display `names` again before extracting the top names per group.
names
Out[161]:
In [125]:
def get_top100(group):
    """Return the (at most) 100 rows of `group` with the largest `births`.

    Rows come back ordered by `births`, largest first.
    """
    ranked = group.sort_values('births', ascending=False)
    return ranked.head(100)
In [159]:
# Split by year and gender, then keep the most frequent names of each group.
grouped = names.groupby(by=['year', 'gender'])
top100 = grouped.apply(get_top100)
top100
Out[159]:
In [160]:
# Replace the index with a plain 0..n-1 range, discarding the old one.
top100 = top100.reset_index(drop=True)
top100
Out[160]:
상위 100개의 이름데이터를 남자(boys)와 여자(girls)로 분리
In [163]:
# Rows whose gender code is 'M' and 'F' respectively.
boys = top100[top100['gender'] == 'M']
girls = top100[top100['gender'] == 'F']
In [164]:
# Births per year with one column per name (rebinds `total_births`).
# The aggregation is named by string: passing the builtin `sum` is deprecated
# in recent pandas and emits a FutureWarning.
total_births = top100.pivot_table('births', index='year', columns='name', aggfunc='sum')
total_births.info()
In [165]:
# Display the current `total_births` table.
total_births
Out[165]:
In [166]:
# Select the columns for four specific names from `total_births`.
total_births[['민준','하준','서연','지우']]
Out[166]:
In [130]:
# One subplot per selected name, showing its column of `total_births`.
subset = total_births[['민준', '하준', '서연', '지우']]
subset.plot(
    subplots=True,
    figsize=(12, 10),
    grid=False,
    title='Number of birth per year',
)
Out[130]:
In [168]:
# Sum of `prop` over the top names, per year and gender.
# The aggregation is named by string: passing the builtin `sum` is deprecated
# in recent pandas and emits a FutureWarning.
table = top100.pivot_table('prop', index='year', columns='gender', aggfunc='sum')
table
Out[168]:
In [173]:
import matplotlib.pyplot as plt

# Sum of `prop` over the top names, per year and gender.
# The aggregation is named by string: passing the builtin `sum` is deprecated
# in recent pandas and emits a FutureWarning.
table = top100.pivot_table('prop', index='year', columns='gender', aggfunc='sum')
# Draw onto an explicitly created Axes. DataFrame.plot() opens a figure of
# its own when no `ax` is passed, so a bare plt.figure() beforehand would be
# left behind as an empty figure.
fig, ax = plt.subplots()
table.plot(ax=ax, title='Sum of table100.prop by year and sex',
           yticks=np.linspace(0.5, 0.7, 5), xticks=range(2008, 2021, 1))
Out[173]:
In [179]:
# Rows of `boys` for the year 2010.
df = boys.loc[boys['year'] == 2010]
df
Out[179]:
In [185]:
# Running total of `prop`, largest share first.
prop_cumsum = df.sort_values('prop', ascending=False)['prop'].cumsum()
# searchsorted gives the 0-based position at which the running total reaches
# 0.5; adding 1 turns that position into a count of names.
prop_cumsum.to_numpy().searchsorted(0.5) + 1
Out[185]:
In [188]:
# First ten entries of the cumulative proportion series.
prop_cumsum[:10]
Out[188]:
In [177]:
# The cumulative proportions as a plain array, without the index.
prop_cumsum.values
Out[177]:
In [189]:
# Same computation as for 2010, now on the 2020 rows of `boys`.
df = boys.loc[boys['year'] == 2020]
y2020 = df.sort_values('prop', ascending=False)['prop'].cumsum()
# 0-based insertion position of 0.5 in the running total, plus 1.
y2020.to_numpy().searchsorted(0.5) + 1
Out[189]:
In [136]:
def get_quantile_count(group, q=0.5):
    """Count how many of the largest `prop` values are needed to reach `q`.

    Rows are ordered by `prop`, largest first, and accumulated; the result is
    the 0-based position at which the running total first reaches `q`,
    plus one.
    """
    ordered = group.sort_values('prop', ascending=False)
    running_total = ordered['prop'].cumsum().to_numpy()
    return running_total.searchsorted(q) + 1
# Quantile count (default q=0.5) for every (year, gender) group, reshaped so
# that each gender becomes its own column.
diversity = top100.groupby(['year', 'gender']).apply(get_quantile_count)
diversity = diversity.unstack('gender')
# Draw onto an explicitly created Axes. DataFrame.plot() opens a figure of
# its own when no `ax` is passed, so a bare plt.figure() would stay empty.
fig, ax = plt.subplots()
diversity.plot(ax=ax, title="Number of popular names in top 50%")
Out[136]:
마지막 글자의 변화
In [137]:
def get_last_letter(x):
    """Return the final character of the string `x`."""
    return x[-1]


# Last character of every name, labelled so it can serve as the pivot index.
last_letters = names['name'].map(get_last_letter).rename('last_letter')
# Births per last letter, with one column per (gender, year) pair.
# The aggregation is named by string: passing the builtin `sum` is deprecated
# in recent pandas and emits a FutureWarning.
table = names.pivot_table('births', index=last_letters,
                          columns=['gender', 'year'], aggfunc='sum')
In [138]:
# Keep only three years from the 'year' level of the columns.
subtable = table.reindex(columns=[2008, 2010, 2020], level='year')
subtable.head()
Out[138]:
In [139]:
# Divide each column by its own total, so every (gender, year) column holds
# the share of births per last letter.
# (A bare `subtable.sum()` line before this had its result discarded.)
letter_prop = subtable / subtable.sum()
letter_prop
Out[139]:
In [140]:
import matplotlib.pyplot as plt

# Two stacked panels: last-letter shares for 'M' on top, 'F' below.
fig, axes = plt.subplots(2, 1, figsize=(30, 11))
letter_prop['M'].plot.bar(rot=0, ax=axes[0], title='Male')
# The legend is drawn only on the top panel.
letter_prop['F'].plot.bar(rot=0, ax=axes[1], title='Female', legend=False)
Out[140]:
In [141]:
# Column-wise shares over the full table (all years), rebinding `letter_prop`.
letter_prop = table.div(table.sum())
# Three chosen last letters for 'M', transposed so each row is a year.
dny_ts = letter_prop.loc[['우', '준', '현'], 'M'].T
dny_ts.head()
Out[141]:
In [142]:
# Close the current figure before starting a new one.
plt.close()
# Draw onto an explicitly created Axes. DataFrame.plot() opens a figure of
# its own when no `ax` is passed, so a bare plt.figure() would stay empty.
fig, ax = plt.subplots()
dny_ts.plot(ax=ax)
Out[142]:
남자이름>여자이름
공통부분: 진
In [144]:
# Distinct names in `top100` that contain the syllable '진'.
all_names = pd.Series(top100['name'].unique())
# Literal substring match: lower-casing has no effect on a Hangul pattern,
# and regex=False keeps the pattern from being interpreted as a regex.
jin_like = all_names[all_names.str.contains('진', regex=False)]
jin_like
Out[144]:
In [145]:
# Rows of `top100` whose name is one of the `jin_like` names,
# then total births per such name.
filtered = top100[top100['name'].isin(jin_like)]
filtered.groupby('name')['births'].sum()
Out[145]:
In [146]:
# Births of the filtered names per year and gender.
table = filtered.pivot_table('births', index='year', columns='gender', aggfunc='sum')
# Normalise each row by its own total, so each year's columns sum to 1.
row_totals = table.sum(axis=1)
table = table.div(row_totals, axis=0)
table.tail()
Out[146]:
In [147]:
# Draw onto an explicitly created Axes. DataFrame.plot() opens a figure of
# its own when no `ax` is passed, so a bare plt.figure() would stay empty.
fig, ax = plt.subplots()
# Per-column line styles: solid black for 'M', dashed blue for 'F'.
table.plot(ax=ax, style={'M': 'k-', 'F': 'b--'})
Out[147]:
In [ ]:
'Python' 카테고리의 다른 글
Seaborn, Matplotlib (0) | 2020.10.04 |
---|---|
마크다운 요약 (0) | 2020.09.29 |
Python_데이터분석2 (0) | 2020.09.15 |
Python_판다스_데이터분석 (0) | 2020.09.14 |
Python_example (0) | 2020.09.11 |