import matplotlib.font_manager as fm
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
# Make matplotlib draw text with the font stored at `font_path`
# (presumably a Hangul-capable font, since the plotted names are Korean).
# NOTE(review): this is a Windows-specific path; adjust on other platforms.
font_path = 'C:/Windows/Fonts/NGULIM.TTF'
fontprop = fm.FontProperties(fname=font_path, size=15)
# Reuse the FontProperties built above rather than constructing a second one
# from the same file just to read its family name.
font_family = fontprop.get_name()

# `plt` must already be imported here; `matplotlib.pyplot` has to be imported
# at the top of the file, not only further down.
plt.rcParams["font.family"] = font_family

# One input file per year: korea/y2008.txt ... korea/y2020.txt.
# Each file is read without a header row, as (name, gender, births).
years = range(2008, 2021)
pieces = []  # one DataFrame per year; concatenated into `names` below
columns = ['name', 'gender', 'births']

for year in years:
    path = 'korea/y{}.txt'.format(year)
    # Tag every row with the year of the file it was read from.
    frame = pd.read_csv(path, names=columns).assign(year=year)
    pieces.append(frame)

# Stack all years into a single frame with a fresh 0..N-1 index.
names = pd.concat(pieces, ignore_index=True)

names

# Total births per year (rows) split by gender (columns).
# The string 'sum' selects pandas' own reduction; passing the builtin `sum`
# triggers a deprecation warning on recent pandas releases.
total_births = names.pivot_table('births', index='year', columns='gender', aggfunc='sum')
total_births

# The pivot computed above is reused for the plot instead of being rebuilt.
total_births.plot(title='total births(gender/year)')

<matplotlib.axes._subplots.AxesSubplot at 0x19294539b50>

def add_prop(group):
    """Return a copy of ``group`` with a ``prop`` column added.

    ``prop`` is each row's ``births`` divided by the sum of ``births`` over
    the whole of ``group``, so the column adds up to 1 within the group.

    The input frame is left untouched: a new frame is returned instead of
    assigning into ``group``, because mutating the object handed to a
    ``groupby(...).apply`` callback can produce unexpected behaviour.
    """
    births = group['births']
    return group.assign(prop=births / births.sum())

# Share of births that each name holds within its (year, gender) group.
# `transform('sum')` broadcasts every group's total back onto its rows, so
# `names` keeps its flat integer index. Going through
# `groupby(...).apply(add_prop)` instead prepends the group keys to the index
# on pandas >= 2.0, after which 'year' and 'gender' are both index levels and
# columns and the later `groupby(['year', 'gender'])` calls become ambiguous.
group_totals = names.groupby(['year', 'gender'])['births'].transform('sum')
names = names.assign(prop=names['births'] / group_totals)
names

# Sanity check: the proportions add up to 1 inside every group.
names.groupby(['year','gender']).prop.sum()

year  gender
2008  F         1.0
      M         1.0
2009  F         1.0
      M         1.0
2010  F         1.0
      M         1.0
2011  F         1.0
      M         1.0
2012  F         1.0
      M         1.0
2013  F         1.0
      M         1.0
2014  F         1.0
      M         1.0
2015  F         1.0
      M         1.0
2016  F         1.0
      M         1.0
2017  F         1.0
      M         1.0
2018  F         1.0
      M         1.0
2019  F         1.0
      M         1.0
2020  F         1.0
      M         1.0
Name: prop, dtype: float64

연도별/성별에 따른 선호하는 이름 100개 추출

names

def get_top100(group):
    """Return the 100 rows of ``group`` with the largest ``births``.

    Rows come back ordered from most to fewest births. A group holding
    fewer than 100 rows is returned whole (still sorted).
    """
    ranked = group.sort_values(by='births', ascending=False)
    return ranked.iloc[:100]

# Keep only the 100 most frequent names inside each (year, gender) group.
grouped = names.groupby(['year', 'gender'])
top100 = grouped.apply(get_top100)
top100

# Throw away the index produced by the grouped apply and renumber from 0.
top100 = top100.reset_index(drop=True)
top100

상위 100개의 이름데이터를 남자(boys)와 여자(girls)로 분리

# Split the top-100 rows by gender.
boys = top100[top100.gender == 'M']
girls = top100[top100.gender == 'F']

# Births per year (rows) for each name (columns). A name that appears for
# both genders in a year is summed; year/name pairs absent from `top100`
# come out as NaN.
# The string 'sum' is used instead of the builtin `sum`, which is deprecated
# as an aggfunc on recent pandas and inconsistent with the later pivot.
total_births = top100.pivot_table('births', index='year', columns='name', aggfunc='sum')
total_births.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 13 entries, 2008 to 2020
Columns: 321 entries, 가연 to 혜인
dtypes: float64(321)
memory usage: 32.7 KB

total_births

# Yearly birth counts for four hand-picked names.
selected_names = ['민준','하준','서연','지우']
total_births[selected_names]

# One subplot per selected name.
subset = total_births[selected_names]
subset.plot(
    subplots=True,
    figsize=(12, 10),
    grid=False,
    title='Number of birth per year',
)

array([<matplotlib.axes._subplots.AxesSubplot object at 0x000001929A4731C0>,
       <matplotlib.axes._subplots.AxesSubplot object at 0x000001929A494460>,
       <matplotlib.axes._subplots.AxesSubplot object at 0x000001929A4B26A0>,
       <matplotlib.axes._subplots.AxesSubplot object at 0x000001929A4DE820>],
      dtype=object)

# Fraction of all births covered by the top-100 names, per year and gender.
# The string 'sum' replaces the builtin `sum`, which is deprecated as an aggfunc.
table = top100.pivot_table('prop', index='year', columns='gender', aggfunc='sum')
table

import matplotlib.pyplot as plt

# DataFrame.plot opens its own figure when no `ax` is given, so a bare
# `plt.figure()` before it only leaves an empty figure behind. Create the
# axes explicitly and draw on it instead.
ax = plt.figure().add_subplot(111)

# The title names the frame that is actually plotted (`top100`).
table.plot(ax=ax, title='Sum of top100.prop by year and sex',
           yticks=np.linspace(0.5, 0.7, 5), xticks=range(2008, 2021, 1))

<matplotlib.axes._subplots.AxesSubplot at 0x1929b8cd550>

<Figure size 432x288 with 0 Axes>

# Top-100 boys' names for 2010.
df = boys[boys.year == 2010]
df

# Running total of the name shares, from the most to the least popular name.
ranked = df.sort_values(by='prop', ascending=False)
prop_cumsum = ranked['prop'].cumsum()
# Position at which the running total reaches 0.5; searchsorted is 0-based,
# hence the +1 to turn it into a count of names.
prop_cumsum.to_numpy().searchsorted(0.5) + 1

86

prop_cumsum[:10]

500    0.020302
501    0.032175
502    0.043879
503    0.055515
504    0.067118
505    0.078692
506    0.089906
507    0.100871
508    0.111758
509    0.122492
Name: prop, dtype: float64

prop_cumsum.values

array([0.02030162, 0.03217477, 0.04387879, 0.05551515, 0.06711769,
       0.07869204, 0.08990557, 0.10087104, 0.11175758, 0.1224919 ,
       0.13315292, 0.14364482, 0.15284567, 0.16193939, 0.17068358,
       0.17936011, 0.18793517, 0.19608175, 0.2040592 , 0.21187315,
       0.21932065, 0.22615363, 0.23295278, 0.23966173, 0.24625229,
       0.25280902, 0.25927555, 0.26525722, 0.27121635, 0.2771642 ,
       0.28307259, 0.28867089, 0.29419591, 0.29955743, 0.30474982,
       0.30973925, 0.31471741, 0.31961099, 0.32446512, 0.32927977,
       0.33408316, 0.33875123, 0.34334602, 0.3479408 , 0.3524623 ,
       0.35696124, 0.36139253, 0.36582382, 0.37017054, 0.37451727,
       0.37884144, 0.38308668, 0.38729246, 0.39140803, 0.39545595,
       0.39941367, 0.40336575, 0.40721071, 0.41103312, 0.41477097,
       0.41845243, 0.42212826, 0.42577026, 0.42938971, 0.43299789,
       0.43660606, 0.44016913, 0.44361381, 0.44700775, 0.45040169,
       0.45373362, 0.45704863, 0.46035236, 0.46363354, 0.46686399,
       0.47008879, 0.47323467, 0.47637491, 0.47950388, 0.48255955,
       0.48560958, 0.48863707, 0.49165891, 0.49466949, 0.49766314,
       0.50063425, 0.50356589, 0.5064919 , 0.50940662, 0.51226498,
       0.51510078, 0.51789711, 0.52068781, 0.52347287, 0.52619591,
       0.52891332, 0.53162509, 0.53432558, 0.53702044, 0.5396871 ])

# Same calculation for the top-100 boys' names of 2020.
df = boys[boys.year == 2020]
ranked_2020 = df.sort_values(by='prop', ascending=False)
y2020 = ranked_2020['prop'].cumsum()
# Number of names needed for the running share to reach 0.5.
y2020.to_numpy().searchsorted(0.5) + 1

64

def get_quantile_count(group, q=0.5):
    """Count the most popular rows needed for their ``prop`` to reach ``q``.

    Rows of ``group`` are ordered by ``prop`` from largest to smallest and
    accumulated; the result is one plus the first position at which the
    running total is at least ``q``. If the total never reaches ``q`` the
    result is ``len(group) + 1``.
    """
    ranked = group.sort_values(by='prop', ascending=False)
    running_share = ranked['prop'].cumsum().to_numpy()
    # searchsorted yields a 0-based insertion point; +1 makes it a count.
    return running_share.searchsorted(q) + 1

# For every (year, gender) group: how many of the most popular names are
# needed to reach the default share (q=0.5) used by get_quantile_count.
diversity = top100.groupby(['year', 'gender']).apply(get_quantile_count)
# Move gender from the row index into the columns: one line per gender.
diversity = diversity.unstack('gender')
# Plot onto the figure created here. Without `ax=`, DataFrame.plot opens a
# second figure and this one stays empty.
fig, ax = plt.subplots()
diversity.plot(ax=ax, title="Number of popular names in top 50%")

<matplotlib.axes._subplots.AxesSubplot at 0x1929a22f1f0>

<Figure size 432x288 with 0 Axes>

마지막 글자의 변화

def get_last_letter(x):
    """Return the final character of the string ``x``."""
    return x[-1]


# Last character of every name, as a Series aligned with `names`.
last_letters = names.name.map(get_last_letter)
last_letters.name = 'last_letter'

# Births summed by last letter (rows) and by gender/year (column levels).
# The string 'sum' replaces the builtin `sum`, which is deprecated as an aggfunc.
table = names.pivot_table('births', index=last_letters, columns=['gender', 'year'], aggfunc='sum')

# Keep three years only, for a compact comparison.
subtable = table.reindex(columns=[2008, 2010, 2020], level='year')
subtable.head()

subtable.sum()
# Normalise each (gender, year) column so that it sums to 1.
letter_prop = subtable / subtable.sum()
letter_prop

import matplotlib.pyplot as plt

# Two stacked bar charts of last-letter shares: male on top, female below.
fig, axes = plt.subplots(2, 1, figsize=(30, 11))
male_ax, female_ax = axes
letter_prop['M'].plot.bar(rot=0, ax=male_ax, title='Male')
# The legend is drawn on the upper chart only.
letter_prop['F'].plot.bar(rot=0, ax=female_ax, title='Female', legend=False)

<matplotlib.axes._subplots.AxesSubplot at 0x1929a6d2c10>

# Normalise the full table (all years), column by column.
letter_prop = table / table.sum()
# Male columns only, restricted to three last letters, then transposed so
# that years are rows and the letters are columns.
male_letter_prop = letter_prop['M']
dny_ts = male_letter_prop.loc[['우','준', '현']].transpose()
dny_ts.head()

plt.close()
# Plot onto the figure created here. Without `ax=`, DataFrame.plot opens a
# second figure and this one stays empty.
fig, ax = plt.subplots()
dny_ts.plot(ax=ax)

<matplotlib.axes._subplots.AxesSubplot at 0x1929d75ea60>

<Figure size 432x288 with 0 Axes>

남자이름>여자이름
공통부분: 진

# Distinct names that appear anywhere in the top-100 data.
all_names = pd.Series(top100['name'].unique())
# Names containing the syllable '진'. The pattern is matched literally
# (regex=False); no case folding is applied because lower-casing cannot
# change whether a string contains this Hangul syllable.
jin_like = all_names[all_names.str.contains('진', regex=False)]
jin_like

10     유진
13     예진
28     서진
53     수진
104    우진
125    진우
177    현진
270    하진
dtype: object

# Rows of top100 whose name is one of the '진' names, then births per name.
is_jin_like = top100['name'].isin(jin_like)
filtered = top100[is_jin_like]
filtered.groupby('name')['births'].sum()

name
서진    25862
수진     2884
예진    13152
우진    18955
유진    14801
진우    10936
하진     4010
현진      498
Name: births, dtype: int64

# Births of the '진' names per year (rows) and gender (columns).
table = filtered.pivot_table('births', index='year', columns='gender', aggfunc='sum')
# Divide each row by its own total so the gender shares of a year sum to 1.
row_totals = table.sum(axis=1)
table = table.div(row_totals, axis=0)
table.tail()

# Plot onto the figure created here. Without `ax=`, DataFrame.plot opens a
# second figure and this one stays empty.
fig, ax = plt.subplots()
# Solid black line for 'M', dashed blue line for 'F'.
table.plot(ax=ax, style={'M': 'k-', 'F': 'b--'})

<matplotlib.axes._subplots.AxesSubplot at 0x1929b62d100>

<Figure size 432x288 with 0 Axes>

	name	gender	births	year
0	서연	F	3280	2008
1	민서	F	2873	2008
2	지민	F	2826	2008
3	서현	F	2606	2008
4	서윤	F	2484	2008
...	...	...	...	...
12995	민승	M	20	2020
12996	규담	M	20	2020
12997	영웅	M	20	2020
12998	재성	M	20	2020
12999	주빈	M	20	2020

gender	F	M
year
2008	191282	169063
2009	186337	166347
2010	195842	177375
2011	199164	180055
2012	204024	185399
2013	189671	173103
2014	187089	174520
2015	190049	178228
2016	176951	165692
2017	154957	148076
2018	144814	137774
2019	133687	128559
2020	52482	50294

	name	gender	births	year	prop
0	서연	F	3280	2008	0.017147
1	민서	F	2873	2008	0.015020
2	지민	F	2826	2008	0.014774
3	서현	F	2606	2008	0.013624
4	서윤	F	2484	2008	0.012986
...	...	...	...	...	...
12995	민승	M	20	2020	0.000398
12996	규담	M	20	2020	0.000398
12997	영웅	M	20	2020	0.000398
12998	재성	M	20	2020	0.000398
12999	주빈	M	20	2020	0.000398

	name	gender	births	year	prop
0	서연	F	3280	2008	0.017147
1	민서	F	2873	2008	0.015020
2	지민	F	2826	2008	0.014774
3	서현	F	2606	2008	0.013624
4	서윤	F	2484	2008	0.012986
...	...	...	...	...	...
12995	민승	M	20	2020	0.000398
12996	규담	M	20	2020	0.000398
12997	영웅	M	20	2020	0.000398
12998	재성	M	20	2020	0.000398
12999	주빈	M	20	2020	0.000398

			name	gender	births	year	prop
year	gender
2008	F	0	서연	F	3280	2008	0.017147
		1	민서	F	2873	2008	0.015020
		2	지민	F	2826	2008	0.014774
		3	서현	F	2606	2008	0.013624
		4	서윤	F	2484	2008	0.012986
...	...	...	...	...	...	...	...
2020	M	12597	승민	M	126	2020	0.002505
		12598	시환	M	126	2020	0.002505
		12595	재하	M	126	2020	0.002505
		12596	태준	M	126	2020	0.002505
		12599	서율	M	122	2020	0.002426

.

Python_우리나라 아기이름 데이터 분석

연도별/성별에 따른 선호하는 이름 100개 추출

상위 100개의 이름데이터를 남자(boys)와 여자(girls)로 분리

'Python' 카테고리의 다른 글

+ Recent posts

티스토리툴바

단축키

내 블로그

블로그 게시글

모든 영역

name	가연	가영	가온	가윤	가은	가현	강민	건	건우	건호	...	현수	현아	현우	현준	현지	현진	현호	형준	혜원	혜인
year
2008	772.0	737.0	NaN	NaN	1571.0	728.0	NaN	532.0	1703.0	NaN	...	768.0	NaN	1924.0	1615.0	991.0	498.0	NaN	472.0	1007.0	NaN
2009	664.0	642.0	NaN	NaN	1703.0	677.0	519.0	489.0	1784.0	NaN	...	687.0	NaN	1909.0	1691.0	891.0	NaN	NaN	NaN	914.0	NaN
2010	652.0	598.0	NaN	554.0	1651.0	680.0	527.0	NaN	1945.0	531.0	...	701.0	576.0	2064.0	1989.0	822.0	NaN	494.0	NaN	1070.0	NaN
2011	NaN	NaN	NaN	734.0	1420.0	635.0	531.0	NaN	1746.0	NaN	...	583.0	586.0	2032.0	1785.0	687.0	NaN	NaN	NaN	966.0	646.0
2012	640.0	NaN	NaN	720.0	1333.0	659.0	493.0	NaN	2050.0	NaN	...	517.0	NaN	2055.0	1560.0	NaN	NaN	NaN	NaN	725.0	NaN
2013	NaN	NaN	NaN	782.0	1121.0	NaN	454.0	NaN	1577.0	NaN	...	468.0	NaN	1861.0	1426.0	NaN	NaN	NaN	NaN	660.0	NaN
2014	NaN	NaN	NaN	600.0	1044.0	NaN	NaN	485.0	1349.0	NaN	...	NaN	NaN	1784.0	1427.0	NaN	NaN	NaN	NaN	694.0	NaN
2015	NaN	NaN	1123.0	588.0	889.0	NaN	NaN	514.0	1479.0	NaN	...	NaN	NaN	1837.0	1294.0	NaN	NaN	NaN	NaN	561.0	NaN
2016	NaN	NaN	518.0	500.0	899.0	NaN	429.0	462.0	1524.0	NaN	...	NaN	NaN	1486.0	1000.0	NaN	NaN	NaN	NaN	NaN	NaN
2017	NaN	NaN	480.0	NaN	831.0	NaN	NaN	NaN	1297.0	NaN	...	NaN	NaN	1224.0	767.0	NaN	NaN	NaN	NaN	NaN	NaN
2018	NaN	NaN	NaN	NaN	594.0	NaN	NaN	NaN	1287.0	NaN	...	NaN	NaN	1068.0	702.0	NaN	NaN	NaN	NaN	NaN	NaN
2019	NaN	NaN	NaN	NaN	553.0	NaN	NaN	320.0	1404.0	NaN	...	NaN	NaN	903.0	596.0	NaN	NaN	NaN	NaN	NaN	NaN
2020	NaN	NaN	NaN	NaN	173.0	NaN	NaN	135.0	536.0	NaN	...	NaN	NaN	349.0	182.0	NaN	NaN	NaN	NaN	NaN	NaN

name	민준	하준	서연	지우
year
2008	2642.0	NaN	3280.0	2815.0
2009	3103.0	496.0	3514.0	3541.0
2010	3601.0	678.0	3518.0	3762.0
2011	4026.0	1160.0	3111.0	3933.0
2012	3691.0	1210.0	3250.0	3938.0
2013	2769.0	1454.0	3133.0	3749.0
2014	4137.0	2977.0	3334.0	3508.0
2015	3821.0	3053.0	3015.0	3783.0
2016	2959.0	2894.0	2551.0	3270.0
2017	2246.0	2723.0	2075.0	2734.0
2018	2022.0	2627.0	1784.0	2572.0
2019	1648.0	2309.0	1406.0	2320.0
2020	586.0	858.0	511.0	901.0

gender	F	M
year
2008	0.578999	0.527945
2009	0.579783	0.538014
2010	0.571665	0.539687
2011	0.570670	0.551487
2012	0.574947	0.552797
2013	0.582466	0.556796
2014	0.595898	0.576415
2015	0.593721	0.578102
2016	0.588982	0.575888
2017	0.593945	0.582100
2018	0.599030	0.593791
2019	0.599991	0.603443
2020	0.604493	0.611663

	name	gender	births	year	prop
500	민준	M	3601	2010	0.020302
501	지훈	M	2106	2010	0.011873
502	예준	M	2076	2010	0.011704
503	현우	M	2064	2010	0.011636
504	지호	M	2058	2010	0.011603
...	...	...	...	...	...
595	도영	M	482	2010	0.002717
596	승훈	M	481	2010	0.002712
597	재훈	M	479	2010	0.002700
598	상현	M	478	2010	0.002695
599	은우	M	473	2010	0.002667

last_letter	우	준	현
year
2008	0.098378	0.095745	0.088730
2009	0.106945	0.103008	0.077567
2010	0.112214	0.112383	0.072896
2011	0.113082	0.118197	0.073600
2012	0.120518	0.116748	0.074191

gender	F	M
year
2016	0.337038	0.662962
2017	0.303000	0.697000
2018	0.328807	0.671193
2019	0.288462	0.711538
2020	0.269565	0.730435

Seaborn, Matplotlib (0)	2020.10.04
마크다운 요약 (0)	2020.09.29
Python_데이터분석2 (0)	2020.09.15
Python_판다스_데이터분석 (0)	2020.09.14
Python_example (0)	2020.09.11

내 블로그 - 관리자 홈 전환	`Q` `Q`
새 글 쓰기	`W` `W`

글 수정 (권한 있는 경우)	`E` `E`
댓글 영역으로 이동	`C` `C`

이 페이지의 URL 복사	`S` `S`
맨 위로 이동	`T` `T`
티스토리 홈 이동	`H` `H`
단축키 안내	`Shift` + `/` `⇧` + `/`