200915_2
In [118]:
import pandas as pd
import numpy as np
import matplotlib.font_manager as fm
import matplotlib.pyplot as plt  # needed below for plt.rcParams; was missing from this cell

# Load a font file and make it matplotlib's default family
# (presumably so the Korean names render in plots -- confirm).
# NOTE(review): absolute Windows path; this cell fails where the file is absent.
font_path = 'C:/Windows/Fonts/NGULIM.TTF'
fontprop = fm.FontProperties(fname=font_path, size=15)
font_family = fm.FontProperties(fname=font_path).get_name()

plt.rcParams["font.family"] = font_family
In [119]:
years = range(2008, 2021)
columns = ['name', 'gender', 'births']

# One frame per yearly file, each tagged with the year it was read for;
# the per-year frames are stacked into a single table below.
pieces = [
    pd.read_csv('korea/y{}.txt'.format(year), names=columns).assign(year=year)
    for year in years
]

names = pd.concat(pieces, ignore_index=True)
In [120]:
names
Out[120]:
name gender births year
0 서연 F 3280 2008
1 민서 F 2873 2008
2 지민 F 2826 2008
3 서현 F 2606 2008
4 서윤 F 2484 2008
... ... ... ... ...
12995 민승 M 20 2020
12996 규담 M 20 2020
12997 영웅 M 20 2020
12998 재성 M 20 2020
12999 주빈 M 20 2020

13000 rows × 4 columns

In [150]:
# Total births per year (rows) and gender (columns).
# The aggregation is named by string: recent pandas warns when the builtin
# `sum` is passed, and 'sum' matches the later pivot in this notebook.
total_births = names.pivot_table('births', index='year', columns='gender', aggfunc='sum')
total_births
Out[150]:
gender F M
year
2008 191282 169063
2009 186337 166347
2010 195842 177375
2011 199164 180055
2012 204024 185399
2013 189671 173103
2014 187089 174520
2015 190049 178228
2016 176951 165692
2017 154957 148076
2018 144814 137774
2019 133687 128559
2020 52482 50294
In [121]:
# Total births per year and gender, drawn as one line per gender.
# 'sum' is passed as a string because recent pandas warns on the builtin `sum`.
total_births = names.pivot_table('births', index='year', columns='gender', aggfunc='sum')
total_births.plot(title='total births(gender/year)')
Out[121]:
<matplotlib.axes._subplots.AxesSubplot at 0x19294539b50>
In [122]:
def add_prop(group):
    """Return ``group`` with a ``prop`` column holding each row's share of births.

    Parameters
    ----------
    group : pandas.DataFrame
        Frame with a numeric ``births`` column.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same rows plus ``prop = births / births.sum()``.
        The frame passed in is left unmodified, so the function is safe to use
        as a ``groupby(...).apply`` callback.
    """
    return group.assign(prop=group.births / group.births.sum())
In [123]:
# Attach each name's share of births within its (year, gender) group.
# transform('sum') broadcasts the group total back onto every row, giving the
# same values as applying add_prop per group while always keeping the original
# flat index and columns, however groupby.apply would label its result.
names = names.assign(
    prop=names['births'] / names.groupby(['year', 'gender'])['births'].transform('sum')
)
names
Out[123]:
name gender births year prop
0 서연 F 3280 2008 0.017147
1 민서 F 2873 2008 0.015020
2 지민 F 2826 2008 0.014774
3 서현 F 2606 2008 0.013624
4 서윤 F 2484 2008 0.012986
... ... ... ... ... ...
12995 민승 M 20 2020 0.000398
12996 규담 M 20 2020 0.000398
12997 영웅 M 20 2020 0.000398
12998 재성 M 20 2020 0.000398
12999 주빈 M 20 2020 0.000398

13000 rows × 5 columns

In [124]:
# Sanity check: the shares inside every (year, gender) group should add up to 1.
names.groupby(by=['year', 'gender'])['prop'].sum()
Out[124]:
year  gender
2008  F         1.0
      M         1.0
2009  F         1.0
      M         1.0
2010  F         1.0
      M         1.0
2011  F         1.0
      M         1.0
2012  F         1.0
      M         1.0
2013  F         1.0
      M         1.0
2014  F         1.0
      M         1.0
2015  F         1.0
      M         1.0
2016  F         1.0
      M         1.0
2017  F         1.0
      M         1.0
2018  F         1.0
      M         1.0
2019  F         1.0
      M         1.0
2020  F         1.0
      M         1.0
Name: prop, dtype: float64

연도별/성별에 따른 선호하는 이름 100개 추출

In [161]:
names
Out[161]:
name gender births year prop
0 서연 F 3280 2008 0.017147
1 민서 F 2873 2008 0.015020
2 지민 F 2826 2008 0.014774
3 서현 F 2606 2008 0.013624
4 서윤 F 2484 2008 0.012986
... ... ... ... ... ...
12995 민승 M 20 2020 0.000398
12996 규담 M 20 2020 0.000398
12997 영웅 M 20 2020 0.000398
12998 재성 M 20 2020 0.000398
12999 주빈 M 20 2020 0.000398

13000 rows × 5 columns

In [125]:
def get_top100(group):
    """Return the 100 rows of ``group`` with the most births, highest first.

    A group with fewer than 100 rows is returned whole (still sorted).
    """
    ranked = group.sort_values(by='births', ascending=False)
    return ranked.iloc[:100]
In [159]:
# Split the table by (year, gender) and keep the 100 most common names
# of every group.
grouped = names.groupby(by=['year', 'gender'])
top100 = grouped.apply(func=get_top100)
top100
Out[159]:
name gender births year prop
year gender
2008 F 0 서연 F 3280 2008 0.017147
1 민서 F 2873 2008 0.015020
2 지민 F 2826 2008 0.014774
3 서현 F 2606 2008 0.013624
4 서윤 F 2484 2008 0.012986
... ... ... ... ... ... ... ...
2020 M 12597 승민 M 126 2020 0.002505
12598 시환 M 126 2020 0.002505
12595 재하 M 126 2020 0.002505
12596 태준 M 126 2020 0.002505
12599 서율 M 122 2020 0.002426

2600 rows × 5 columns

In [160]:
# Replace the (year, gender, row) index with a plain 0..n-1 range.
# Reassigning instead of inplace=True avoids mutating the frame that the
# previous cell displayed.
top100 = top100.reset_index(drop=True)
top100
Out[160]:
name gender births year prop
0 서연 F 3280 2008 0.017147
1 민서 F 2873 2008 0.015020
2 지민 F 2826 2008 0.014774
3 서현 F 2606 2008 0.013624
4 서윤 F 2484 2008 0.012986
... ... ... ... ... ...
2595 승민 M 126 2020 0.002505
2596 시환 M 126 2020 0.002505
2597 재하 M 126 2020 0.002505
2598 태준 M 126 2020 0.002505
2599 서율 M 122 2020 0.002426

2600 rows × 5 columns

상위 100개의 이름 데이터를 남자(boys)와 여자(girls)로 분리

In [163]:
# Split the top-100 table into one frame per gender.
boys = top100.loc[top100['gender'] == 'M']
girls = top100.loc[top100['gender'] == 'F']
In [164]:
# Births per year (rows) for every name in the top-100 table (columns).
# A name absent from a year's top 100 shows up as NaN.
# 'sum' is passed as a string because recent pandas warns on the builtin `sum`.
total_births = top100.pivot_table('births', index='year', columns='name', aggfunc='sum')
total_births.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 13 entries, 2008 to 2020
Columns: 321 entries, 가연 to 혜인
dtypes: float64(321)
memory usage: 32.7 KB
In [165]:
total_births
Out[165]:
name 가연 가영 가온 가윤 가은 가현 강민 건우 건호 ... 현수 현아 현우 현준 현지 현진 현호 형준 혜원 혜인
year
2008 772.0 737.0 NaN NaN 1571.0 728.0 NaN 532.0 1703.0 NaN ... 768.0 NaN 1924.0 1615.0 991.0 498.0 NaN 472.0 1007.0 NaN
2009 664.0 642.0 NaN NaN 1703.0 677.0 519.0 489.0 1784.0 NaN ... 687.0 NaN 1909.0 1691.0 891.0 NaN NaN NaN 914.0 NaN
2010 652.0 598.0 NaN 554.0 1651.0 680.0 527.0 NaN 1945.0 531.0 ... 701.0 576.0 2064.0 1989.0 822.0 NaN 494.0 NaN 1070.0 NaN
2011 NaN NaN NaN 734.0 1420.0 635.0 531.0 NaN 1746.0 NaN ... 583.0 586.0 2032.0 1785.0 687.0 NaN NaN NaN 966.0 646.0
2012 640.0 NaN NaN 720.0 1333.0 659.0 493.0 NaN 2050.0 NaN ... 517.0 NaN 2055.0 1560.0 NaN NaN NaN NaN 725.0 NaN
2013 NaN NaN NaN 782.0 1121.0 NaN 454.0 NaN 1577.0 NaN ... 468.0 NaN 1861.0 1426.0 NaN NaN NaN NaN 660.0 NaN
2014 NaN NaN NaN 600.0 1044.0 NaN NaN 485.0 1349.0 NaN ... NaN NaN 1784.0 1427.0 NaN NaN NaN NaN 694.0 NaN
2015 NaN NaN 1123.0 588.0 889.0 NaN NaN 514.0 1479.0 NaN ... NaN NaN 1837.0 1294.0 NaN NaN NaN NaN 561.0 NaN
2016 NaN NaN 518.0 500.0 899.0 NaN 429.0 462.0 1524.0 NaN ... NaN NaN 1486.0 1000.0 NaN NaN NaN NaN NaN NaN
2017 NaN NaN 480.0 NaN 831.0 NaN NaN NaN 1297.0 NaN ... NaN NaN 1224.0 767.0 NaN NaN NaN NaN NaN NaN
2018 NaN NaN NaN NaN 594.0 NaN NaN NaN 1287.0 NaN ... NaN NaN 1068.0 702.0 NaN NaN NaN NaN NaN NaN
2019 NaN NaN NaN NaN 553.0 NaN NaN 320.0 1404.0 NaN ... NaN NaN 903.0 596.0 NaN NaN NaN NaN NaN NaN
2020 NaN NaN NaN NaN 173.0 NaN NaN 135.0 536.0 NaN ... NaN NaN 349.0 182.0 NaN NaN NaN NaN NaN NaN

13 rows × 321 columns

In [166]:
total_births[['민준','하준','서연','지우']]
Out[166]:
name 민준 하준 서연 지우
year
2008 2642.0 NaN 3280.0 2815.0
2009 3103.0 496.0 3514.0 3541.0
2010 3601.0 678.0 3518.0 3762.0
2011 4026.0 1160.0 3111.0 3933.0
2012 3691.0 1210.0 3250.0 3938.0
2013 2769.0 1454.0 3133.0 3749.0
2014 4137.0 2977.0 3334.0 3508.0
2015 3821.0 3053.0 3015.0 3783.0
2016 2959.0 2894.0 2551.0 3270.0
2017 2246.0 2723.0 2075.0 2734.0
2018 2022.0 2627.0 1784.0 2572.0
2019 1648.0 2309.0 1406.0 2320.0
2020 586.0 858.0 511.0 901.0
In [130]:
# Yearly birth counts for four selected names, one subplot per name.
subset = total_births.loc[:, ['민준', '하준', '서연', '지우']]
subset.plot(
    subplots=True,
    figsize=(12, 10),
    grid=False,
    title='Number of birth per year',
)
Out[130]:
array([<matplotlib.axes._subplots.AxesSubplot object at 0x000001929A4731C0>,
       <matplotlib.axes._subplots.AxesSubplot object at 0x000001929A494460>,
       <matplotlib.axes._subplots.AxesSubplot object at 0x000001929A4B26A0>,
       <matplotlib.axes._subplots.AxesSubplot object at 0x000001929A4DE820>],
      dtype=object)
In [168]:
# Share of all births covered by the top-100 names, per year and gender.
# 'sum' is passed as a string because recent pandas warns on the builtin `sum`.
table = top100.pivot_table('prop', index='year', columns='gender', aggfunc='sum')
table
Out[168]:
gender F M
year
2008 0.578999 0.527945
2009 0.579783 0.538014
2010 0.571665 0.539687
2011 0.570670 0.551487
2012 0.574947 0.552797
2013 0.582466 0.556796
2014 0.595898 0.576415
2015 0.593721 0.578102
2016 0.588982 0.575888
2017 0.593945 0.582100
2018 0.599030 0.593791
2019 0.599991 0.603443
2020 0.604493 0.611663
In [173]:
import matplotlib.pyplot as plt

# Share of all births covered by the top-100 names, per year and gender.
table = top100.pivot_table('prop', index='year', columns='gender', aggfunc='sum')

# Draw on an explicitly created Axes. DataFrame.plot opens its own figure when
# no `ax` is given, so a bare plt.figure() beforehand only leaves an empty
# figure in the output.
fig, ax = plt.subplots()
table.plot(ax=ax, title='Sum of table100.prop by year and sex',
           yticks=np.linspace(0.5, 0.7, 5), xticks=range(2008, 2021, 1))
Out[173]:
<matplotlib.axes._subplots.AxesSubplot at 0x1929b8cd550>
<Figure size 432x288 with 0 Axes>
In [179]:
# Top-100 boys' names for the year 2010.
df = boys.loc[boys['year'] == 2010]
df
Out[179]:
name gender births year prop
500 민준 M 3601 2010 0.020302
501 지훈 M 2106 2010 0.011873
502 예준 M 2076 2010 0.011704
503 현우 M 2064 2010 0.011636
504 지호 M 2058 2010 0.011603
... ... ... ... ... ...
595 도영 M 482 2010 0.002717
596 승훈 M 481 2010 0.002712
597 재훈 M 479 2010 0.002700
598 상현 M 478 2010 0.002695
599 은우 M 473 2010 0.002667

100 rows × 5 columns

In [185]:
# Running total of birth shares, from the most to the least common name.
prop_cumsum = df.sort_values(by='prop', ascending=False)['prop'].cumsum()
# Position of the first running total that reaches 0.5, as a 1-based count.
prop_cumsum.to_numpy().searchsorted(0.5) + 1
Out[185]:
86
In [188]:
prop_cumsum[:10]
Out[188]:
500    0.020302
501    0.032175
502    0.043879
503    0.055515
504    0.067118
505    0.078692
506    0.089906
507    0.100871
508    0.111758
509    0.122492
Name: prop, dtype: float64
In [177]:
prop_cumsum.values
Out[177]:
array([0.02030162, 0.03217477, 0.04387879, 0.05551515, 0.06711769,
       0.07869204, 0.08990557, 0.10087104, 0.11175758, 0.1224919 ,
       0.13315292, 0.14364482, 0.15284567, 0.16193939, 0.17068358,
       0.17936011, 0.18793517, 0.19608175, 0.2040592 , 0.21187315,
       0.21932065, 0.22615363, 0.23295278, 0.23966173, 0.24625229,
       0.25280902, 0.25927555, 0.26525722, 0.27121635, 0.2771642 ,
       0.28307259, 0.28867089, 0.29419591, 0.29955743, 0.30474982,
       0.30973925, 0.31471741, 0.31961099, 0.32446512, 0.32927977,
       0.33408316, 0.33875123, 0.34334602, 0.3479408 , 0.3524623 ,
       0.35696124, 0.36139253, 0.36582382, 0.37017054, 0.37451727,
       0.37884144, 0.38308668, 0.38729246, 0.39140803, 0.39545595,
       0.39941367, 0.40336575, 0.40721071, 0.41103312, 0.41477097,
       0.41845243, 0.42212826, 0.42577026, 0.42938971, 0.43299789,
       0.43660606, 0.44016913, 0.44361381, 0.44700775, 0.45040169,
       0.45373362, 0.45704863, 0.46035236, 0.46363354, 0.46686399,
       0.47008879, 0.47323467, 0.47637491, 0.47950388, 0.48255955,
       0.48560958, 0.48863707, 0.49165891, 0.49466949, 0.49766314,
       0.50063425, 0.50356589, 0.5064919 , 0.50940662, 0.51226498,
       0.51510078, 0.51789711, 0.52068781, 0.52347287, 0.52619591,
       0.52891332, 0.53162509, 0.53432558, 0.53702044, 0.5396871 ])
In [189]:
# Same count for 2020: how many of the most common boys' names are needed
# for the running share to reach 0.5.
df = boys.loc[boys['year'] == 2020]
y2020 = df.sort_values(by='prop', ascending=False)['prop'].cumsum()
y2020.to_numpy().searchsorted(0.5) + 1
Out[189]:
64
In [136]:
def get_quantile_count(group, q=0.5):
    """Count how many of the largest ``prop`` values are needed to reach ``q``.

    The rows are ordered by ``prop`` from largest to smallest and accumulated;
    the result is the 1-based position of the first running total that is
    greater than or equal to ``q``.
    """
    ranked = group.sort_values(by='prop', ascending=False)
    running_share = ranked['prop'].cumsum().to_numpy()
    return running_share.searchsorted(q) + 1

# Per (year, gender): the number of names needed to reach the default q=0.5
# share, then pivoted so each gender becomes a column.
diversity = top100.groupby(['year', 'gender']).apply(get_quantile_count)
diversity = diversity.unstack('gender')

# Plot on an explicitly created Axes; a bare plt.figure() followed by
# DataFrame.plot leaves an extra empty figure in the output.
fig, ax = plt.subplots()
diversity.plot(ax=ax, title="Number of popular names in top 50%")
Out[136]:
<matplotlib.axes._subplots.AxesSubplot at 0x1929a22f1f0>
<Figure size 432x288 with 0 Axes>

마지막 글자의 변화

In [137]:
def get_last_letter(x):
    """Return the final character of the string ``x``."""
    return x[-1]


# Last character of every name, as a Series named 'last_letter'.
last_letters = names.name.map(get_last_letter).rename('last_letter')

# Births per last letter (rows) for every gender/year pair (columns).
# 'sum' is passed as a string because recent pandas warns on the builtin `sum`.
table = names.pivot_table('births', index=last_letters, columns=['gender', 'year'], aggfunc='sum')
In [138]:
# Keep only three years from the 'year' level of the column index.
subtable = table.reindex(level='year', columns=[2008, 2010, 2020])
subtable.head(5)
Out[138]:
gender F M
year 2008 2010 2020 2008 2010 2020
last_letter
NaN NaN NaN 106.0 NaN 34.0
NaN NaN NaN 1755.0 2208.0 688.0
NaN NaN NaN NaN NaN NaN
184.0 90.0 NaN 1159.0 968.0 348.0
NaN 65.0 65.0 92.0 NaN 446.0
In [139]:
# Divide every (gender, year) column by its own total, so each column holds
# the share of births per last letter. The unused standalone `subtable.sum()`
# statement that preceded this was dropped; its result was never displayed.
letter_prop = subtable / subtable.sum()
letter_prop
Out[139]:
gender F M
year 2008 2010 2020 2008 2010 2020
last_letter
NaN NaN NaN 0.000627 NaN 0.000676
NaN NaN NaN 0.010381 0.012448 0.013680
NaN NaN NaN NaN NaN NaN
0.000962 0.000460 NaN 0.006855 0.005457 0.006919
NaN 0.000332 0.001239 0.000544 NaN 0.008868
... ... ... ... ... ... ...
0.003220 0.003467 0.000457 0.012120 0.025872 0.029427
NaN NaN NaN 0.044309 0.038269 0.019684
NaN NaN NaN NaN NaN NaN
NaN NaN NaN 0.001686 0.001466 0.001153
0.035534 0.033262 0.022351 0.012705 0.011704 0.006939

128 rows × 6 columns

In [140]:
import matplotlib.pyplot as plt

# Two stacked bar charts of last-letter shares: male on top, female below.
fig, axes = plt.subplots(nrows=2, ncols=1, figsize=(30, 11))
letter_prop['M'].plot.bar(rot=0, ax=axes[0], title='Male')
letter_prop['F'].plot.bar(rot=0, ax=axes[1], title='Female', legend=False)
Out[140]:
<matplotlib.axes._subplots.AxesSubplot at 0x1929a6d2c10>
In [141]:
# Last-letter shares over all years, each column divided by its own total.
letter_prop = table.div(table.sum())
# Male shares for three last letters, transposed so the years become rows.
dny_ts = letter_prop.loc[['우', '준', '현'], 'M'].transpose()
dny_ts.head()
Out[141]:
last_letter
year
2008 0.098378 0.095745 0.088730
2009 0.106945 0.103008 0.077567
2010 0.112214 0.112383 0.072896
2011 0.113082 0.118197 0.073600
2012 0.120518 0.116748 0.074191
In [142]:
plt.close()
# Plot on an explicitly created Axes; a bare plt.figure() followed by
# DataFrame.plot leaves an extra empty figure in the output.
fig, ax = plt.subplots()
dny_ts.plot(ax=ax)
Out[142]:
<matplotlib.axes._subplots.AxesSubplot at 0x1929d75ea60>
<Figure size 432x288 with 0 Axes>

남자 이름 > 여자 이름
공통 부분: 진

In [144]:
# Distinct names in the top-100 table that contain the syllable '진'.
all_names = pd.Series(top100['name'].unique())
jin_like = all_names[all_names.str.contains('진')]
jin_like
Out[144]:
10     유진
13     예진
28     서진
53     수진
104    우진
125    진우
177    현진
270    하진
dtype: object
In [145]:
# Rows of the top-100 table whose name is one of the '진' names,
# then total births per name.
filtered = top100.loc[top100['name'].isin(jin_like)]
filtered.groupby('name')['births'].sum()
Out[145]:
name
서진    25862
수진     2884
예진    13152
우진    18955
유진    14801
진우    10936
하진     4010
현진      498
Name: births, dtype: int64
In [146]:
# Births of the '진' names per year and gender ...
table = filtered.pivot_table(values='births', index='year',
                             columns='gender', aggfunc='sum')
# ... turned into each gender's share of that year's row total.
row_totals = table.sum(axis=1)
table = table.div(row_totals, axis=0)
table.tail()
Out[146]:
gender F M
year
2016 0.337038 0.662962
2017 0.303000 0.697000
2018 0.328807 0.671193
2019 0.288462 0.711538
2020 0.269565 0.730435
In [147]:
# Plot on an explicitly created Axes; a bare plt.figure() followed by
# DataFrame.plot leaves an extra empty figure in the output.
fig, ax = plt.subplots()
table.plot(ax=ax, style={'M': 'k-', 'F': 'b--'})
Out[147]:
<matplotlib.axes._subplots.AxesSubplot at 0x1929b62d100>
<Figure size 432x288 with 0 Axes>
In [ ]:
 

'Python' 카테고리의 다른 글

Seaborn, Matplotlib  (0) 2020.10.04
마크다운 요약  (0) 2020.09.29
Python_데이터분석2  (0) 2020.09.15
Python_판다스_데이터분석  (0) 2020.09.14
Python_example  (0) 2020.09.11

+ Recent posts