import pandas as pd 
import numpy as np

# Read the 1880 records, supplying the column labels explicitly via `names`.
name_1880 = pd.read_csv(
    'data2/yob1880.txt',
    names=['name', 'gender', 'births'],
)
name_1880

# Sum the `births` column separately for each `gender` value.
name_1880.groupby('gender')['births'].sum()

gender
F     90993
M    110493
Name: births, dtype: int64

# Load one file per year (1880..2010 inclusive), tag every row with its
# year, and stack everything into a single frame called `names`.
years = range(1880, 2011)
columns = ['name', 'gender', 'births']

pieces = []
for year in years:
    # No trailing dot after ".txt": the previous 'yob{}.txt.' spelling only
    # resolves on filesystems that silently strip trailing dots, and it did
    # not match the 'data2/yob1880.txt' form used for the single-file read.
    path = 'data2/yob{}.txt'.format(year)
    frame = pd.read_csv(path, names=columns)
    frame['year'] = year
    pieces.append(frame)

# ignore_index=True discards the per-file row numbers and renumbers 0..n-1.
names = pd.concat(pieces, ignore_index=True)

names

# Total births with one row per year and one column per gender.
# The aggregation is named by the string 'sum' rather than the builtin
# `sum`: pandas deprecated passing the builtin callable, and the string form
# matches the other pivot_table call in this file that already uses it.
total_births = names.pivot_table('births', index='year', columns='gender', aggfunc='sum')
total_births.plot(title='Total births(gender / year)')

<matplotlib.axes._subplots.AxesSubplot at 0x207c061d7f0>

def add_prop(group):
    """Return *group* with a ``prop`` column added.

    ``prop`` is each row's ``births`` divided by the sum of ``births`` over
    all rows of *group*.

    A new frame is returned and *group* itself is left unmodified: this
    function is used as a ``GroupBy.apply`` callback, and pandas does not
    support callbacks that mutate the object they are handed.
    """
    return group.assign(prop=group['births'] / group['births'].sum())

# Add the per-(year, gender) proportion column to every row.
# group_keys=False keeps the original flat row index. With the default
# (True), pandas >= 2.0 prepends 'year' and 'gender' as index levels even for
# a like-indexed result, so those labels would exist both as index levels
# and as columns.
name = names.groupby(['year', 'gender'], group_keys=False).apply(add_prop)
name

Mary의 prop 값 0.077643은 1880년도에 태어난 여자아이 중 Mary라는 이름이 약 7.8%를 차지한다는 의미이다.

# Replace `names` with a version that carries the `prop` column.
# group_keys=False keeps the flat row index: with the default (True),
# pandas >= 2.0 would prepend 'year'/'gender' index levels, and a later
# names.groupby(['year', 'gender']) would then be ambiguous because each
# label would be both an index level and a column.
names = names.groupby(['year', 'gender'], group_keys=False).apply(add_prop)
names

연도별 / 성별에 따른 선호하는 이름 1000개 추출

def get_top1000(group):
    """Return at most the first 1000 rows of *group* ordered by ``births``.

    The rows are sorted by ``births`` from largest to smallest and the
    leading 1000 are kept; a group with fewer rows comes back whole.
    """
    by_births_desc = group.sort_values(ascending=False, by='births')
    return by_births_desc[:1000]

# Split the rows by (year, gender) and keep each group's top rows.
grouped = names.groupby(['year', 'gender'])
top1000 = grouped.apply(get_top1000)
top1000

# Throw away the index produced by the grouped apply and renumber from 0.
top1000 = top1000.reset_index(drop=True)
top1000

상위 1000개의 이름 데이터를 남자(boys)와 여자(girls)로 분리

# Partition the top-1000 table on the `gender` column.
boys = top1000[top1000['gender'] == 'M']
girls = top1000[top1000['gender'] == 'F']

연도와 출생수를 피봇테이블로 변환

# Births per year (rows) for every name (columns) in the top-1000 table.
# NOTE: this rebinds `total_births`, which earlier held the per-gender totals.
# The aggregation is named by the string 'sum' rather than the builtin
# `sum`, which pandas has deprecated as an `aggfunc` argument.
total_births = top1000.pivot_table('births', index='year', columns='name', aggfunc='sum')
total_births.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 131 entries, 1880 to 2010
Columns: 6868 entries, Aaden to Zuri
dtypes: float64(6868)
memory usage: 6.9 MB

# Preview the first rows of the year-by-name births table.
total_births.head()

# Pick four name columns and draw each one in its own subplot
# (subplots=True), sharing a single overall title.
subset = total_births[['John','Harry','Mary','Alice']]
subset.plot(subplots=True,figsize =(12,10), grid=False,
           title='Number of birth per year')

array([<matplotlib.axes._subplots.AxesSubplot object at 0x00000207B9D5A370>,
       <matplotlib.axes._subplots.AxesSubplot object at 0x00000207B3CD0190>,
       <matplotlib.axes._subplots.AxesSubplot object at 0x00000207B40C83D0>,
       <matplotlib.axes._subplots.AxesSubplot object at 0x00000207B40F5550>],
      dtype=object)

import matplotlib.pyplot as plt

# Sum of `prop` over the top-1000 rows, per year (rows) and gender (columns).
# 'sum' is passed as a string because the builtin `sum` is deprecated as an
# `aggfunc` argument.
table = top1000.pivot_table('prop', index='year', columns='gender', aggfunc='sum')

# Create the Axes explicitly and hand it to DataFrame.plot. A bare
# plt.figure() followed by table.plot() leaves that figure empty, because
# DataFrame.plot opens a figure of its own when no `ax` is supplied.
fig, ax = plt.subplots()
table.plot(ax=ax, title='Sum of table1000.prop by year and sex',
           yticks=np.linspace(0, 1.2, 13), xticks=range(1880, 2020, 10))

<matplotlib.axes._subplots.AxesSubplot at 0x207c293fe20>

<Figure size 432x288 with 0 Axes>

# Rows of the boys table for the year 2010.
df = boys[boys['year'] == 2010]
df

# Running total of `prop`, taking the largest proportions first.
ranked_2010 = df.sort_values(by='prop', ascending=False)
prop_cumsum = ranked_2010['prop'].cumsum()
prop_cumsum[:10]
# Insertion position of 0.5 in the running total, made 1-based.
prop_cumsum.values.searchsorted(0.5) + 1

117

가정: 부모들은 아이에게 흔한 이름을 지어 주는 것을 좋아하지 않는다.

# Same computation for 1910: rows of the boys table for that year ...
df = boys[boys['year'] == 1910]
# ... running total of `prop`, largest proportions first ...
ranked_1910 = df.sort_values(by='prop', ascending=False)
y1910 = ranked_1910['prop'].cumsum()
# ... and the 1-based insertion position of 0.5 in that running total.
y1910.values.searchsorted(0.5) + 1

31

def get_quantile_count(group, q=0.5):
    """Count the highest-``prop`` rows needed for their total to reach *q*.

    The rows of *group* are ordered by ``prop`` from largest to smallest
    and cumulatively summed; the result is one plus the position at which
    *q* would be inserted into that running total.
    """
    ranked = group.sort_values(ascending=False, by='prop')
    running_total = ranked['prop'].cumsum().values
    insert_at = running_total.searchsorted(q)
    return insert_at + 1

# For every (year, gender) group, count how many of its most popular names
# are needed to reach the default quantile of get_quantile_count.
diversity = top1000.groupby(['year', 'gender']).apply(get_quantile_count)
# Move the 'gender' index level into the columns: one row per year.
diversity = diversity.unstack('gender')
# Create the Axes explicitly and pass it to the plot call; a bare
# plt.figure() would stay empty because DataFrame.plot opens its own figure
# when no `ax` is given.
fig, ax = plt.subplots()
diversity.plot(ax=ax, title="Number of popular names in top 50%")

<matplotlib.axes._subplots.AxesSubplot at 0x207b4679f40>

<Figure size 432x288 with 0 Axes>

마지막 글자의 변화

def get_last_letter(x):
    """Return the final character of the string *x*."""
    return x[-1]


# Last character of every name, as a Series aligned with `names`.
last_letters = names.name.map(get_last_letter)
last_letters.name = 'last_letter'

# Births summed per last letter (rows) and per (gender, year) (columns).
# 'sum' is passed as a string because the builtin `sum` is deprecated as an
# `aggfunc` argument.
table = names.pivot_table('births', index=last_letters, columns=['gender', 'year'], aggfunc='sum')

# Keep only the columns whose 'year' level is 1910, 1960 or 2010.
subtable = table.reindex(columns=[1910, 1960, 2010], level='year')
subtable.head()

# Column totals, then each cell as a fraction of its column total.
column_totals = subtable.sum()
column_totals
letter_prop = subtable / column_totals
letter_prop

# NOTE(review): matplotlib.pyplot is already imported earlier in this file;
# repeating the import is harmless.
import matplotlib.pyplot as plt

# Two stacked panels: the 'M' columns of letter_prop as bars on the upper
# Axes and the 'F' columns on the lower one (legend suppressed there).
fig, axes = plt.subplots(2, 1, figsize=(15, 11))
letter_prop['M'].plot(kind='bar', rot=0, ax=axes[0], title='Male')
letter_prop['F'].plot(kind='bar', rot=0, ax=axes[1], title='Female',
                      legend=False)

<matplotlib.axes._subplots.AxesSubplot at 0x207b4855b80>

# Divide every column of `table` by that column's total.
letter_prop= table / table.sum()
# Take rows 'd', 'n', 'y' of the 'M' columns and transpose, so the selected
# letters become the columns.
dny_ts= letter_prop.loc[['d','n','y'], 'M'].T 
dny_ts.head()

# Close the current figure before starting a new one.
plt.close()
# Create the Axes explicitly and pass it to the plot call; a bare
# plt.figure() would stay empty because DataFrame.plot opens its own figure
# when no `ax` is given.
fig, ax = plt.subplots()
dny_ts.plot(ax=ax)

<matplotlib.axes._subplots.AxesSubplot at 0x207c0ec6d90>

<Figure size 432x288 with 0 Axes>

남자 이름에서 여자 이름으로 바뀐 이름의 예: Lesley 또는 Leslie
두 이름의 공통 부분: Lesl

# Distinct names appearing in top1000, wrapped in a Series so the `.str`
# accessor is available.
all_names = pd.Series(top1000.name.unique())
# Keep the names whose lower-cased form contains 'lesl'.
lesley_like = all_names[all_names.str.lower().str.contains('lesl')]
lesley_like

632     Leslie
2294    Lesley
4262    Leslee
4728     Lesli
6103     Lesly
dtype: object

# Rows of top1000 whose name is one of the matched names.
filtered = top1000[top1000.name.isin(lesley_like)]
# Total births for each of those names.
filtered.groupby('name').births.sum()

name
Leslee      1082
Lesley     35022
Lesli        929
Leslie    370429
Lesly      10067
Name: births, dtype: int64

# Births of the matched names summed per year (rows) and gender (columns).
table = filtered.pivot_table(values='births', index='year', columns='gender', aggfunc='sum')
# Divide each row by that row's total, giving per-year gender shares.
yearly_total = table.sum(axis=1)
table = table.div(yearly_total, axis='index')
table.tail()

# Create the Axes explicitly and pass it to the plot call; a bare
# plt.figure() would stay empty because DataFrame.plot opens its own figure
# when no `ax` is given.
fig, ax = plt.subplots()
# Per-column line styles: solid black for 'M', dashed blue for 'F'.
table.plot(ax=ax, style={'M': 'k-', 'F': 'b--'})

<matplotlib.axes._subplots.AxesSubplot at 0x207c70116d0>

<Figure size 432x288 with 0 Axes>

	name	gender	births
0	Mary	F	7065
1	Anna	F	2604
2	Emma	F	2003
3	Elizabeth	F	1939
4	Minnie	F	1746
...	...	...	...
1995	Woodie	M	5
1996	Worthy	M	5
1997	Wright	M	5
1998	York	M	5
1999	Zachariah	M	5

	name	gender	births	year
0	Mary	F	7065	1880
1	Anna	F	2604	1880
2	Emma	F	2003	1880
3	Elizabeth	F	1939	1880
4	Minnie	F	1746	1880
...	...	...	...	...
1690779	Zymaire	M	5	2010
1690780	Zyonne	M	5	2010
1690781	Zyquarius	M	5	2010
1690782	Zyran	M	5	2010
1690783	Zzyzx	M	5	2010

	name	gender	births	year	prop
0	Mary	F	7065	1880	0.077643
1	Anna	F	2604	1880	0.028618
2	Emma	F	2003	1880	0.022013
3	Elizabeth	F	1939	1880	0.021309
4	Minnie	F	1746	1880	0.019188
...	...	...	...	...	...
1690779	Zymaire	M	5	2010	0.000003
1690780	Zyonne	M	5	2010	0.000003
1690781	Zyquarius	M	5	2010	0.000003
1690782	Zyran	M	5	2010	0.000003
1690783	Zzyzx	M	5	2010	0.000003

	name	gender	births	year	prop
0	Mary	F	7065	1880	0.077643
1	Anna	F	2604	1880	0.028618
2	Emma	F	2003	1880	0.022013
3	Elizabeth	F	1939	1880	0.021309
4	Minnie	F	1746	1880	0.019188
...	...	...	...	...	...
1690779	Zymaire	M	5	2010	0.000003
1690780	Zyonne	M	5	2010	0.000003
1690781	Zyquarius	M	5	2010	0.000003
1690782	Zyran	M	5	2010	0.000003
1690783	Zzyzx	M	5	2010	0.000003

			name	gender	births	year	prop
year	gender
1880	F	0	Mary	F	7065	1880	0.077643
		1	Anna	F	2604	1880	0.028618
		2	Emma	F	2003	1880	0.022013
		3	Elizabeth	F	1939	1880	0.021309
		4	Minnie	F	1746	1880	0.019188
...	...	...	...	...	...	...	...
2010	M	1677639	Camilo	M	194	2010	0.000102
		1677640	Destin	M	194	2010	0.000102
		1677641	Jaquan	M	194	2010	0.000102
		1677642	Jaydan	M	194	2010	0.000102
		1677645	Maxton	M	193	2010	0.000102

.

Python_데이터분석2

연도별 / 성별에 따른 선호하는 이름 1000개 추출

상위 1000개의 이름 데이터를 남자(boys)와 여자(girls)로 분리

마지막 글자의 변화

'Python' 카테고리의 다른 글

+ Recent posts

티스토리툴바

name	Aaden	Aaliyah	Aarav	Aaron	Aarush	Ab	Abagail	Abb	Abbey	Abbie	...	Zoa	Zoe	Zoey	Zoie	Zola	Zollie	Zona	Zora	Zula	Zuri
year
1880	NaN	NaN	NaN	102.0	NaN	NaN	NaN	NaN	NaN	71.0	...	8.0	23.0	NaN	NaN	7.0	NaN	8.0	28.0	27.0	NaN
1881	NaN	NaN	NaN	94.0	NaN	NaN	NaN	NaN	NaN	81.0	...	NaN	22.0	NaN	NaN	10.0	NaN	9.0	21.0	27.0	NaN
1882	NaN	NaN	NaN	85.0	NaN	NaN	NaN	NaN	NaN	80.0	...	8.0	25.0	NaN	NaN	9.0	NaN	17.0	32.0	21.0	NaN
1883	NaN	NaN	NaN	105.0	NaN	NaN	NaN	NaN	NaN	79.0	...	NaN	23.0	NaN	NaN	10.0	NaN	11.0	35.0	25.0	NaN
1884	NaN	NaN	NaN	97.0	NaN	NaN	NaN	NaN	NaN	98.0	...	13.0	31.0	NaN	NaN	14.0	6.0	8.0	58.0	27.0	NaN

	name	gender	births	year	prop
260877	Jacob	M	21875	2010	0.011523
260878	Ethan	M	17866	2010	0.009411
260879	Michael	M	17133	2010	0.009025
260880	Jayden	M	17030	2010	0.008971
260881	William	M	16870	2010	0.008887
...	...	...	...	...	...
261872	Camilo	M	194	2010	0.000102
261873	Destin	M	194	2010	0.000102
261874	Jaquan	M	194	2010	0.000102
261875	Jaydan	M	194	2010	0.000102
261876	Maxton	M	193	2010	0.000102

gender	F			M
year	1910	1960	2010	1910	1960	2010
last_letter
a	108376.0	691247.0	670605.0	977.0	5204.0	28438.0
b	NaN	694.0	450.0	411.0	3912.0	38859.0
c	5.0	49.0	946.0	482.0	15476.0	23125.0
d	6750.0	3729.0	2607.0	22111.0	262112.0	44398.0
e	133569.0	435013.0	313833.0	28655.0	178823.0	129012.0

last_letter	d	n	y
year
1880	0.083055	0.153213	0.075760
1881	0.083247	0.153214	0.077451
1882	0.085340	0.149560	0.077537
1883	0.084066	0.151646	0.079144
1884	0.086120	0.149915	0.080405

마크다운 요약 (0)	2020.09.29
Python_우리나라 아기이름 데이터 분석 (1)	2020.09.15
Python_판다스_데이터분석 (0)	2020.09.14
Python_example (0)	2020.09.11
python_pandas(판다스): 계층적 색인 지정, 누락된 데이터처리, 결측치채우기, 데이터 변형하기, onehot인코딩 (0)	2020.09.11

gender	F			M
year	1910	1960	2010	1910	1960	2010
last_letter
a	0.273390	0.341853	0.381240	0.005031	0.002440	0.014980
b	NaN	0.000343	0.000256	0.002116	0.001834	0.020470
c	0.000013	0.000024	0.000538	0.002482	0.007257	0.012181
d	0.017028	0.001844	0.001482	0.113858	0.122908	0.023387
e	0.336941	0.215133	0.178415	0.147556	0.083853	0.067959
f	NaN	0.000010	0.000055	0.000783	0.004325	0.001188
g	0.000144	0.000157	0.000374	0.002250	0.009488	0.001404
h	0.051529	0.036224	0.075852	0.045562	0.037907	0.051670
i	0.001526	0.039965	0.031734	0.000844	0.000603	0.022628
j	NaN	NaN	0.000090	NaN	NaN	0.000769
k	0.000121	0.000156	0.000356	0.036581	0.049384	0.018541
l	0.043189	0.033867	0.026356	0.065016	0.104904	0.070367
m	0.001201	0.008613	0.002588	0.058044	0.033827	0.024657
n	0.079240	0.130687	0.140210	0.143415	0.152522	0.362771
o	0.001660	0.002439	0.001243	0.017065	0.012829	0.042681
p	0.000018	0.000023	0.000020	0.003172	0.005675	0.001269
q	NaN	NaN	0.000030	NaN	NaN	0.000180
r	0.013390	0.006764	0.018025	0.064481	0.031034	0.087477
s	0.039042	0.012764	0.013332	0.130815	0.102730	0.065145
t	0.027438	0.015201	0.007830	0.072879	0.065655	0.022861
u	0.000684	0.000574	0.000417	0.000124	0.000057	0.001221
v	NaN	0.000060	0.000117	0.000113	0.000037	0.001434
w	0.000020	0.000031	0.001182	0.006329	0.007711	0.016148
x	0.000015	0.000037	0.000727	0.003965	0.001851	0.008614
y	0.110972	0.152569	0.116828	0.077349	0.160987	0.058168
z	0.002439	0.000659	0.000704	0.000170	0.000184	0.001831

gender	F	M
year
2006	1.0	NaN
2007	1.0	NaN
2008	1.0	NaN
2009	1.0	NaN
2010	1.0	NaN