import pandas as pd
import numpy as np
import matplotlib.font_manager as fm
import matplotlib.pyplot as plt  # needed by the rcParams assignment below

# Make the font file at font_path the default plotting font.
# NOTE(review): presumably chosen so that Korean labels render - confirm.
font_path = 'C:/Windows/Fonts/NGULIM.TTF'
fontprop = fm.FontProperties(fname=font_path, size=15)
font_family = fm.FontProperties(fname=font_path).get_name()

plt.rcParams["font.family"] = font_family

# Read one CSV per year (2008-2020) and stack them into a single frame.
years = range(2008, 2021)
columns = ['name', 'gender', 'births']

# One DataFrame per year, each tagged with the year it was read for.
pieces = [
    pd.read_csv('korea/y{}.txt'.format(year), names=columns).assign(year=year)
    for year in years
]

names = pd.concat(pieces, ignore_index=True)

names

# Total births per year, split by gender.  The table is built once (it was
# previously computed twice in a row) and aggregated with the string alias
# 'sum', as the later pivot_table calls in this file already do.
total_births = names.pivot_table('births', index='year', columns='gender', aggfunc='sum')
total_births

total_births.plot(title='total births(gender/year)')

<matplotlib.axes._subplots.AxesSubplot at 0x19294539b50>

def add_prop(group):
    """Return *group* with a ``prop`` column holding each row's share of births.

    ``prop`` is ``births`` divided by the sum of ``births`` over the group.
    A new frame is returned instead of writing into *group*, because
    functions passed to ``groupby(...).apply`` should not mutate their input.
    """
    return group.assign(prop=group.births / group.births.sum())

# group_keys=False keeps the original row index, so 'year' and 'gender'
# remain ordinary columns and can be grouped on again below.
names = names.groupby(['year', 'gender'], group_keys=False).apply(add_prop)
names

# Sanity check: the shares inside every (year, gender) group add up to 1.
names.groupby(['year', 'gender']).prop.sum()

year  gender
2008  F         1.0
      M         1.0
2009  F         1.0
      M         1.0
2010  F         1.0
      M         1.0
2011  F         1.0
      M         1.0
2012  F         1.0
      M         1.0
2013  F         1.0
      M         1.0
2014  F         1.0
      M         1.0
2015  F         1.0
      M         1.0
2016  F         1.0
      M         1.0
2017  F         1.0
      M         1.0
2018  F         1.0
      M         1.0
2019  F         1.0
      M         1.0
2020  F         1.0
      M         1.0
Name: prop, dtype: float64

연도별/성별에 따른 선호하는 이름 100개 추출¶

names

def get_top100(group):
    """Return the 100 rows of *group* with the most births, largest first."""
    ranked = group.sort_values(by='births', ascending=False)
    return ranked.head(100)

# Keep only the 100 most common names of every (year, gender) group.
grouped = names.groupby(['year', 'gender'])
top100 = grouped.apply(get_top100)
top100

# Throw away the index produced by apply and number the rows 0..n-1.
top100 = top100.reset_index(drop=True)
top100

상위 100개의 이름데이터를 남자(boys)와 여자(girls)로 분리¶

# Split the top-100 rows by gender.
is_male = top100.gender == 'M'
is_female = top100.gender == 'F'
boys = top100[is_male]
girls = top100[is_female]

# Births per name (columns) for every year (rows); print a summary of the table.
total_births = top100.pivot_table('births', index='year', columns='name', aggfunc=sum)
total_births.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 13 entries, 2008 to 2020
Columns: 321 entries, 가연 to 혜인
dtypes: float64(321)
memory usage: 32.7 KB

total_births

# Yearly birth counts for four selected names.
selected_names = ['민준', '하준', '서연', '지우']
total_births[selected_names]

subset = total_births[selected_names]
subset.plot(
    subplots=True,
    figsize=(12, 10),
    grid=False,
    title='Number of birth per year',
)

array([<matplotlib.axes._subplots.AxesSubplot object at 0x000001929A4731C0>,
       <matplotlib.axes._subplots.AxesSubplot object at 0x000001929A494460>,
       <matplotlib.axes._subplots.AxesSubplot object at 0x000001929A4B26A0>,
       <matplotlib.axes._subplots.AxesSubplot object at 0x000001929A4DE820>],
      dtype=object)

import matplotlib.pyplot as plt

# Sum of prop over the top-100 rows, per year (rows) and gender (columns).
table = top100.pivot_table('prop', index='year', columns='gender', aggfunc='sum')
table

# Plot on an explicitly created Axes.  Calling plt.figure() and then
# DataFrame.plot() without ax= leaves that first figure empty, because
# DataFrame.plot() opens a figure of its own.
fig, ax = plt.subplots()
table.plot(ax=ax, title='Sum of table100.prop by year and sex',
           yticks=np.linspace(0.5, 0.7, 5), xticks=range(2008, 2021, 1))

<matplotlib.axes._subplots.AxesSubplot at 0x1929b8cd550>

<Figure size 432x288 with 0 Axes>

# Boys' top-100 rows for 2010.
df = boys[boys.year == 2010]
df

# Running total of prop from the most to the least common name, then the
# 1-based number of names needed for that total to reach 0.5.
sorted_by_share = df.sort_values(by='prop', ascending=False)
prop_cumsum = sorted_by_share.prop.cumsum()
prop_cumsum.values.searchsorted(0.5) + 1

86

prop_cumsum[:10]

500    0.020302
501    0.032175
502    0.043879
503    0.055515
504    0.067118
505    0.078692
506    0.089906
507    0.100871
508    0.111758
509    0.122492
Name: prop, dtype: float64

prop_cumsum.values

array([0.02030162, 0.03217477, 0.04387879, 0.05551515, 0.06711769,
       0.07869204, 0.08990557, 0.10087104, 0.11175758, 0.1224919 ,
       0.13315292, 0.14364482, 0.15284567, 0.16193939, 0.17068358,
       0.17936011, 0.18793517, 0.19608175, 0.2040592 , 0.21187315,
       0.21932065, 0.22615363, 0.23295278, 0.23966173, 0.24625229,
       0.25280902, 0.25927555, 0.26525722, 0.27121635, 0.2771642 ,
       0.28307259, 0.28867089, 0.29419591, 0.29955743, 0.30474982,
       0.30973925, 0.31471741, 0.31961099, 0.32446512, 0.32927977,
       0.33408316, 0.33875123, 0.34334602, 0.3479408 , 0.3524623 ,
       0.35696124, 0.36139253, 0.36582382, 0.37017054, 0.37451727,
       0.37884144, 0.38308668, 0.38729246, 0.39140803, 0.39545595,
       0.39941367, 0.40336575, 0.40721071, 0.41103312, 0.41477097,
       0.41845243, 0.42212826, 0.42577026, 0.42938971, 0.43299789,
       0.43660606, 0.44016913, 0.44361381, 0.44700775, 0.45040169,
       0.45373362, 0.45704863, 0.46035236, 0.46363354, 0.46686399,
       0.47008879, 0.47323467, 0.47637491, 0.47950388, 0.48255955,
       0.48560958, 0.48863707, 0.49165891, 0.49466949, 0.49766314,
       0.50063425, 0.50356589, 0.5064919 , 0.50940662, 0.51226498,
       0.51510078, 0.51789711, 0.52068781, 0.52347287, 0.52619591,
       0.52891332, 0.53162509, 0.53432558, 0.53702044, 0.5396871 ])

# Same count as above, for boys born in 2020.
df = boys[boys.year == 2020]
sorted_2020 = df.sort_values(by='prop', ascending=False)
y2020 = sorted_2020.prop.cumsum()
y2020.values.searchsorted(0.5) + 1

64

def get_quantile_count(group, q=0.5):
    """Count how many of the most common names are needed to reach share *q*.

    Rows are ordered by ``prop`` from largest to smallest and accumulated;
    the result is the 1-based position at which the running total reaches *q*.
    """
    running_share = group.sort_values(by='prop', ascending=False)['prop'].cumsum()
    position = running_share.values.searchsorted(q)
    return position + 1

# For every (year, gender) group, count the names that make up the top 50%
# of prop, then move gender into the columns for plotting.
diversity = top100.groupby(['year', 'gender']).apply(get_quantile_count)
diversity = diversity.unstack('gender')

# Plot on an explicit Axes; a bare plt.figure() followed by DataFrame.plot()
# would be left behind as an empty figure.
fig, ax = plt.subplots()
diversity.plot(ax=ax, title="Number of popular names in top 50%")

<matplotlib.axes._subplots.AxesSubplot at 0x1929a22f1f0>

<Figure size 432x288 with 0 Axes>

마지막 글자의 변화

def get_last_letter(x):
    """Return the final character of the name *x*."""
    return x[-1]


# Final character of every name, used as the row key of the pivot below.
last_letters = names.name.map(get_last_letter)
last_letters.name = 'last_letter'

# Births by last character (rows) for each gender/year pair (columns).
table = names.pivot_table(
    'births',
    index=last_letters,
    columns=['gender', 'year'],
    aggfunc=sum,
)

# Restrict the columns to three years.
subtable = table.reindex(columns=[2008, 2010, 2020], level='year')
subtable.head()

# Divide every column by its total so each one sums to 1.
subtable.sum()
column_totals = subtable.sum()
letter_prop = subtable / column_totals
letter_prop

import matplotlib.pyplot as plt

# Two stacked bar charts of last-character shares: male on top, female below.
fig, axes = plt.subplots(2, 1, figsize=(30, 11))
male_ax, female_ax = axes[0], axes[1]
letter_prop['M'].plot(kind='bar', rot=0, ax=male_ax, title='Male')
letter_prop['F'].plot(kind='bar', rot=0, ax=female_ax, title='Female', legend=False)

<matplotlib.axes._subplots.AxesSubplot at 0x1929a6d2c10>

# Per-year share of male births for names ending in '우', '준' or '현'.
letter_prop = table / table.sum()
dny_ts = letter_prop.loc[['우', '준', '현'], 'M'].T
dny_ts.head()

plt.close()
# Plot on an explicit Axes; a bare plt.figure() followed by DataFrame.plot()
# would be left behind as an empty figure.
fig, ax = plt.subplots()
dny_ts.plot(ax=ax)

<matplotlib.axes._subplots.AxesSubplot at 0x1929d75ea60>

<Figure size 432x288 with 0 Axes>

남자이름>여자이름
공통부분: 진

# Unique names in the top-100 table that contain the syllable '진'.
all_names = pd.Series(top100.name.unique())
contains_jin = all_names.str.lower().str.contains('진')
jin_like = all_names[contains_jin]
jin_like

10     유진
13     예진
28     서진
53     수진
104    우진
125    진우
177    현진
270    하진
dtype: object

# Total births for each of the '진' names within the top-100 table.
is_jin_name = top100.name.isin(jin_like)
filtered = top100[is_jin_name]
filtered.groupby('name')['births'].sum()

name
서진    25862
수진     2884
예진    13152
우진    18955
유진    14801
진우    10936
하진     4010
현진      498
Name: births, dtype: int64

# Births of the selected names per year and gender, divided by each year's
# row total so that every row sums to 1.
table = filtered.pivot_table('births', index='year',
                             columns='gender', aggfunc='sum')
table = table.div(table.sum(axis=1), axis=0)  # axis passed by keyword
table.tail()

# Plot on an explicit Axes; a bare plt.figure() followed by DataFrame.plot()
# would be left behind as an empty figure.
fig, ax = plt.subplots()
table.plot(ax=ax, style={'M': 'k-', 'F': 'b--'})

<matplotlib.axes._subplots.AxesSubplot at 0x1929b62d100>

<Figure size 432x288 with 0 Axes>

import pandas as pd 
import numpy as np

# Read the 1880 file on its own and total the births per gender.
name_columns = ['name', 'gender', 'births']
name_1880 = pd.read_csv('data2/yob1880.txt', names=name_columns)
name_1880

name_1880.groupby('gender')['births'].sum()

gender
F     90993
M    110493
Name: births, dtype: int64

# Read one CSV per year (1880-2010) and stack them into a single frame.
years = range(1880, 2011)
pieces = []
columns = ['name', 'gender', 'births']

for year in years:
    # The file name ends in '.txt' with no trailing dot, matching the
    # single-file read of 'data2/yob1880.txt' earlier in this file.
    path = 'data2/yob{}.txt'.format(year)
    frame = pd.read_csv(path, names=columns)
    frame['year'] = year
    pieces.append(frame)
names = pd.concat(pieces, ignore_index=True)

names

# Total births per year, split by gender.
total_births = names.pivot_table('births', index='year', columns='gender', aggfunc='sum')
total_births.plot(title='Total births(gender / year)')

<matplotlib.axes._subplots.AxesSubplot at 0x207c061d7f0>

def add_prop(group):
    """Return *group* with a ``prop`` column holding each row's share of births.

    ``prop`` is ``births`` divided by the sum of ``births`` over the group.
    A new frame is returned instead of writing into *group*, because
    functions passed to ``groupby(...).apply`` should not mutate their input.
    """
    return group.assign(prop=group.births / group.births.sum())

# Preview of the frame with the per-group share added.  group_keys=False
# keeps the original row index instead of adding year/gender index levels.
name = names.groupby(['year', 'gender'], group_keys=False).apply(add_prop)
name

Mary prop:0.077643은 1880년도에 태어난 여자 중에서 약 7.8%를 차지

# group_keys=False keeps the original row index, so 'year' and 'gender'
# remain ordinary columns for the groupby calls that follow.
names = names.groupby(['year', 'gender'], group_keys=False).apply(add_prop)
names

연도별 / 성별에 따른 선호하는 이름 1000개 추출¶

def get_top1000(group):
    """Return the 1000 rows of *group* with the most births, largest first."""
    ranked = group.sort_values(by='births', ascending=False)
    return ranked.head(1000)

# Keep only the 1000 most common names of every (year, gender) group.
grouped = names.groupby(['year', 'gender'])
top1000 = grouped.apply(get_top1000)
top1000

# Throw away the index produced by apply and number the rows 0..n-1.
top1000 = top1000.reset_index(drop=True)
top1000

상위 1000개의 이름데이터를 남자(boys)와 여자(girls)로 분리¶

# Split the top-1000 rows by gender.
is_male = top1000.gender == 'M'
is_female = top1000.gender == 'F'
boys = top1000[is_male]
girls = top1000[is_female]

연도와 출생수를 피봇테이블로 변환

# Births per name (columns) for every year (rows); print a summary of the table.
total_births = top1000.pivot_table(
    'births',
    index='year',
    columns='name',
    aggfunc=sum,
)
total_births.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 131 entries, 1880 to 2010
Columns: 6868 entries, Aaden to Zuri
dtypes: float64(6868)
memory usage: 6.9 MB

total_births.head()

# Yearly birth counts for four selected names, one subplot per name.
selected_names = ['John', 'Harry', 'Mary', 'Alice']
subset = total_births[selected_names]
subset.plot(
    subplots=True,
    figsize=(12, 10),
    grid=False,
    title='Number of birth per year',
)

array([<matplotlib.axes._subplots.AxesSubplot object at 0x00000207B9D5A370>,
       <matplotlib.axes._subplots.AxesSubplot object at 0x00000207B3CD0190>,
       <matplotlib.axes._subplots.AxesSubplot object at 0x00000207B40C83D0>,
       <matplotlib.axes._subplots.AxesSubplot object at 0x00000207B40F5550>],
      dtype=object)

import matplotlib.pyplot as plt

# Sum of prop over the top-1000 rows, per year (rows) and gender (columns).
table = top1000.pivot_table('prop', index='year', columns='gender', aggfunc='sum')

# Plot on an explicit Axes; a bare plt.figure() followed by DataFrame.plot()
# would be left behind as an empty figure.
fig, ax = plt.subplots()
table.plot(ax=ax, title='Sum of table1000.prop by year and sex',
           yticks=np.linspace(0, 1.2, 13), xticks=range(1880, 2020, 10))

<matplotlib.axes._subplots.AxesSubplot at 0x207c293fe20>

<Figure size 432x288 with 0 Axes>

# Boys' top-1000 rows for 2010.
df = boys[boys.year == 2010]
df

# Running total of prop from the most to the least common name, then the
# 1-based number of names needed for that total to reach 0.5.
sorted_by_share = df.sort_values(by='prop', ascending=False)
prop_cumsum = sorted_by_share.prop.cumsum()
prop_cumsum[:10]
prop_cumsum.values.searchsorted(0.5) + 1

117

가정: 흔한 이름 쓰는 걸 좋아하지 않는다.

# Same count as above, for boys born in 1910.
df = boys[boys.year == 1910]
sorted_1910 = df.sort_values(by='prop', ascending=False)
y1910 = sorted_1910.prop.cumsum()
y1910.values.searchsorted(0.5) + 1

31

def get_quantile_count(group, q=0.5):
    """Count how many of the most common names are needed to reach share *q*.

    Rows are ordered by ``prop`` from largest to smallest and accumulated;
    the result is the 1-based position at which the running total reaches *q*.
    """
    running_share = group.sort_values(by='prop', ascending=False)['prop'].cumsum()
    position = running_share.values.searchsorted(q)
    return position + 1

# For every (year, gender) group, count the names that make up the top 50%
# of prop, then move gender into the columns for plotting.
diversity = top1000.groupby(['year', 'gender']).apply(get_quantile_count)
diversity = diversity.unstack('gender')

# Plot on an explicit Axes; a bare plt.figure() followed by DataFrame.plot()
# would be left behind as an empty figure.
fig, ax = plt.subplots()
diversity.plot(ax=ax, title="Number of popular names in top 50%")

<matplotlib.axes._subplots.AxesSubplot at 0x207b4679f40>

<Figure size 432x288 with 0 Axes>

마지막 글자의 변화¶

def get_last_letter(x):
    """Return the final character of the name *x*."""
    return x[-1]


# Final character of every name, used as the row key of the pivot below.
last_letters = names.name.map(get_last_letter)
last_letters.name = 'last_letter'

# Births by last letter (rows) for each gender/year pair (columns).
table = names.pivot_table(
    'births',
    index=last_letters,
    columns=['gender', 'year'],
    aggfunc=sum,
)

# Restrict the columns to three years.
subtable = table.reindex(columns=[1910, 1960, 2010], level='year')
subtable.head()

# Divide every column by its total so each one sums to 1.
subtable.sum()
column_totals = subtable.sum()
letter_prop = subtable / column_totals
letter_prop

import matplotlib.pyplot as plt

# Two stacked bar charts of last-letter shares: male on top, female below.
fig, axes = plt.subplots(2, 1, figsize=(15, 11))
male_ax, female_ax = axes[0], axes[1]
letter_prop['M'].plot(kind='bar', rot=0, ax=male_ax, title='Male')
letter_prop['F'].plot(kind='bar', rot=0, ax=female_ax, title='Female', legend=False)

<matplotlib.axes._subplots.AxesSubplot at 0x207b4855b80>

# Per-year share of male births for names ending in 'd', 'n' or 'y'.
letter_prop = table / table.sum()
dny_ts = letter_prop.loc[['d', 'n', 'y'], 'M'].T
dny_ts.head()

plt.close()
# Plot on an explicit Axes; a bare plt.figure() followed by DataFrame.plot()
# would be left behind as an empty figure.
fig, ax = plt.subplots()
dny_ts.plot(ax=ax)

<matplotlib.axes._subplots.AxesSubplot at 0x207c0ec6d90>

<Figure size 432x288 with 0 Axes>

남자이름>여자이름 Lesley or Leslie
공통부분: Lesl

# Unique names in the top-1000 table whose lower-cased form contains 'lesl'.
all_names = pd.Series(top1000.name.unique())
contains_lesl = all_names.str.lower().str.contains('lesl')
lesley_like = all_names[contains_lesl]
lesley_like

632     Leslie
2294    Lesley
4262    Leslee
4728     Lesli
6103     Lesly
dtype: object

# Total births for each Lesley-like name within the top-1000 table.
is_lesley_name = top1000.name.isin(lesley_like)
filtered = top1000[is_lesley_name]
filtered.groupby('name')['births'].sum()

name
Leslee      1082
Lesley     35022
Lesli        929
Leslie    370429
Lesly      10067
Name: births, dtype: int64

# Births of the selected names per year and gender, divided by each year's
# row total so that every row sums to 1.
table = filtered.pivot_table('births', index='year',
                             columns='gender', aggfunc='sum')
table = table.div(table.sum(axis=1), axis=0)  # axis passed by keyword
table.tail()

# Plot on an explicit Axes; a bare plt.figure() followed by DataFrame.plot()
# would be left behind as an empty figure.
fig, ax = plt.subplots()
table.plot(ax=ax, style={'M': 'k-', 'F': 'b--'})

<matplotlib.axes._subplots.AxesSubplot at 0x207c70116d0>

<Figure size 432x288 with 0 Axes>

import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt

read_csv로 데이터를 Dataframe형태로 불러옴¶

file_path='data/drinks.csv'
drinks=pd.read_csv(file_path)

info()함수를 통해 결측(NaN)값을 확인할 수 있음

print(drinks.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 193 entries, 0 to 192
Data columns (total 6 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   country                       193 non-null    object 
 1   beer_servings                 193 non-null    int64  
 2   spirit_servings               193 non-null    int64  
 3   wine_servings                 193 non-null    int64  
 4   total_litres_of_pure_alcohol  193 non-null    float64
 5   continent                     170 non-null    object 
dtypes: float64(1), int64(3), object(2)
memory usage: 9.2+ KB
None

drinks.head(10)  # head() shows 5 rows when no count is given

drinks.describe()  # only the numeric columns are summarised

# Pearson correlation: a measure of the linear relationship between two variables.
beer_and_wine = ['beer_servings', 'wine_servings']
corr = drinks[beer_and_wine].corr(method='pearson')
print(corr)  # each column correlates with itself at 1

beer_servings  wine_servings
beer_servings       1.000000       0.527172
wine_servings       0.527172       1.000000

# Pearson correlation matrix of the four numeric consumption columns.
cols = [
    'beer_servings',
    'spirit_servings',
    'wine_servings',
    'total_litres_of_pure_alcohol',
]
numeric_part = drinks[cols]
corr = numeric_part.corr(method='pearson')
print(corr)

beer_servings  spirit_servings  wine_servings  \
beer_servings                      1.000000         0.458819       0.527172   
spirit_servings                    0.458819         1.000000       0.194797   
wine_servings                      0.527172         0.194797       1.000000   
total_litres_of_pure_alcohol       0.835839         0.654968       0.667598   

                              total_litres_of_pure_alcohol  
beer_servings                                     0.835839  
spirit_servings                                   0.654968  
wine_servings                                     0.667598  
total_litres_of_pure_alcohol                      1.000000

corr.values

array([[1.        , 0.45881887, 0.52717169, 0.83583863],
       [0.45881887, 1.        , 0.19479705, 0.65496818],
       [0.52717169, 0.19479705, 1.        , 0.66759834],
       [0.83583863, 0.65496818, 0.66759834, 1.        ]])

heatmap[heat(열)+map(지도)]: 데이터들의 배열을 색상으로 표현해주는 그래프

heatmap을 사용하면 두 개의 카테고리 값에 대한 값 변화를 한눈에 알기 쉬움
대용량 데이터도 쉽게 표현 가능

import seaborn as sns

# Draw the correlation matrix as an annotated heat map.
cols_view = ['beer', 'spirit', 'wine', 'alcohol']  # shortened axis labels
sns.set(font_scale=1.5)
heatmap_options = dict(
    cbar=True,               # show the colour bar
    annot=True,              # write the value inside every cell
    fmt='.2f',               # two decimal places
    square=True,
    annot_kws={'size': 15},  # font size of the cell values
    yticklabels=cols_view,
    xticklabels=cols_view,
)
hm = sns.heatmap(corr.values, **heatmap_options)

plt.tight_layout()  # adjust the margins
plt.show()

# Pairwise scatter plots of the same four features.
sns.set(style='whitegrid', context='notebook')
pair_columns = ['beer_servings', 'spirit_servings',
                'wine_servings', 'total_litres_of_pure_alcohol']
sns.pairplot(drinks[pair_columns], height=2.5)
plt.show()
# Every point is one country.

correlation : 각 숫자형 변수들 간의 상관관계를 나타낸 계수¶

0.7~1은 강한 양의 상관관계 ex) y = x
-0.7~-1은 강한 음의 상관관계 ex) y= -x
(0에 가까울 수록 두 변수들 간의 상관이 없다)

scatterplot(산점도) : 탐색적 분석에서 필수적으로 사용하며 각 점들을 찍은 그래프(역시 숫자형 변수들만)¶

그래프를 이용하여 각 변수들 간의 상관관계나 모양 같은 것을 봄
y = x 와 같은 상관관계를 쉽게 볼 수 있고, 다른 관계도 추측할 수 있다.

결측 데이터 전처리¶

print(drinks.isnull().sum())
print("----------------------------------------")
print(drinks.dtypes)

country                          0
beer_servings                    0
spirit_servings                  0
wine_servings                    0
total_litres_of_pure_alcohol     0
continent                       23
dtype: int64
----------------------------------------
country                          object
beer_servings                     int64
spirit_servings                   int64
wine_servings                     int64
total_litres_of_pure_alcohol    float64
continent                        object
dtype: object

결측 데이터 채우기

drinks['continent']=drinks['continent'].fillna('OT') #OT=OTHER

print(drinks.isnull().sum())

country                         0
beer_servings                   0
spirit_servings                 0
wine_servings                   0
total_litres_of_pure_alcohol    0
continent                       0
dtype: int64

print(drinks.info()) # 5번 193으로 달라짐

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 193 entries, 0 to 192
Data columns (total 6 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   country                       193 non-null    object 
 1   beer_servings                 193 non-null    int64  
 2   spirit_servings               193 non-null    int64  
 3   wine_servings                 193 non-null    int64  
 4   total_litres_of_pure_alcohol  193 non-null    float64
 5   continent                     193 non-null    object 
dtypes: float64(1), int64(3), object(2)
memory usage: 9.2+ KB
None

# Number of countries per continent code, most frequent first.
continent_counts = drinks['continent'].value_counts()
labels = continent_counts.index.tolist()   # continent codes
fracs1 = continent_counts.values.tolist()  # matching country counts
# Offset the 'OT' wedge wherever it falls in the ordering, instead of
# assuming it is always the fourth of exactly six wedges.
explode = tuple(0.25 if label == 'OT' else 0 for label in labels)
plt.pie(fracs1, explode=explode, labels=labels, autopct='%.0f%%', shadow=True)  # shadow=True adds a drop shadow
plt.title("NULL Data to 'OT'")
plt.show()

labels= drinks['continent'].value_counts().index.tolist()
print(labels)

['AF', 'EU', 'AS', 'OT', 'OC', 'SA']

fracs1=drinks['continent'].value_counts().values.tolist()
print(fracs1)

[53, 45, 44, 23, 16, 12]

plt.figure(figsize=(5, 5))  # figure size
# Offset the 'OT' wedge wherever it falls in labels, instead of assuming it
# is always the fourth of exactly six wedges.
explode = tuple(0.2 if label == 'OT' else 0 for label in labels)
plt.pie(
    fracs1,               # wedge sizes
    explode=explode,      # how far each wedge is pulled out of the pie
    labels=labels,
    colors=None,
    autopct='%.1f%%',
    pctdistance=0.6,
    shadow=False,
    labeldistance=1.1,
    startangle=230,       # e.g. 90 would rotate the start 90 degrees counter-clockwise
    radius=None,
    counterclock=True,    # counter-clockwise; False reverses the wedge order
    wedgeprops=None,
    textprops=None,
    center=(0, 0),
    frame=False,
    rotatelabels=False,
)
plt.title("NULL Data to 'OT'")
plt.show()
plt.rc('font', size=10)  # set the font size back to 10

https://matplotlib.org/3.1.1/api/_as_gen/matplotlib.pyplot.pie.html

# Basic statistics of spirit_servings per continent.
basic_stats = ['mean', 'min', 'max', 'sum']
result = drinks.groupby('continent')['spirit_servings'].agg(basic_stats)
result.head(10)

# agg() applies several aggregations at once; 'var' is the variance and
# 'std' the standard deviation.
full_stats = ['count', 'mean', 'min', 'max', 'sum', 'var', 'std']

result = drinks.groupby('continent')['spirit_servings'].agg(full_stats)
result.head(10)

result = drinks.groupby('continent')['beer_servings'].agg(full_stats)
result.head(10)

result = drinks.groupby('continent')['wine_servings'].agg(full_stats)
result.head(10)

result = drinks.groupby('continent')['total_litres_of_pure_alcohol'].agg(full_stats)
result.head(10)

total_litres_of_pure_alcohol의 전체평균

total_mean=drinks.total_litres_of_pure_alcohol.mean()
total_mean

4.717098445595855

continent_mean=drinks.groupby('continent')['total_litres_of_pure_alcohol'].mean()
print(continent_mean)

continent
AF    3.007547
AS    2.170455
EU    8.617778
OC    3.381250
OT    5.995652
SA    6.308333
Name: total_litres_of_pure_alcohol, dtype: float64

continent_over_mean=continent_mean[continent_mean >=total_mean]
print(continent_over_mean)

continent
EU    8.617778
OT    5.995652
SA    6.308333
Name: total_litres_of_pure_alcohol, dtype: float64

평균 beer_servings가 가장 높은 대륙

beer_continent= drinks.groupby('continent').beer_servings.mean()
beer_continent

continent
AF     61.471698
AS     37.045455
EU    193.777778
OC     89.687500
OT    145.434783
SA    175.083333
Name: beer_servings, dtype: float64

beer_continent_max=beer_continent[beer_continent==beer_continent.max()]
beer_continent_max

continent
EU    193.777778
Name: beer_servings, dtype: float64

beer_continent.idxmax()

'EU'

beer_continent[beer_continent.idxmax()]

193.77777777777777

평균 beer_servings가 가장 낮은 대륙

beer_continent_min=beer_continent[beer_continent==beer_continent.min()]
beer_continent_min

continent
AS    37.045455
Name: beer_servings, dtype: float64

beer_continent.idxmin()

'AS'

beer_continent[beer_continent.idxmin()]

37.04545454545455

시각화¶

대륙별 spirit_servings의 평균,최소,최대를 시각화

result=drinks.groupby('continent').spirit_servings. \
        agg(['mean','min','max','count'])
result.head(10)

number_groups=len(result.index) 
number_groups

6

means= result['mean'].tolist()
means

[16.339622641509433,
 60.84090909090909,
 132.55555555555554,
 58.4375,
 165.7391304347826,
 114.75]

mins = result['min'].tolist()
maxs = result['max'].tolist()
counts = result['count'].tolist()
index = np.arange(number_groups)

plt.figure(figsize=(10, 5))  # figure size
bar_width = 0.2  # matplotlib's default is 0.8

# One bar series per statistic, each shifted right by one more bar width.
series_specs = [
    (means, 'r', 'Mean'),
    (mins, 'g', 'Min'),
    (maxs, 'b', 'Max'),
    (counts, 'y', 'Count'),
]
rects_1, rects_2, rects_3, rects_4 = [
    plt.bar(index + bar_width * slot, values, bar_width, color=color, label=label)
    for slot, (values, color, label) in enumerate(series_specs)
]

plt.xticks(index, result.index.tolist())
plt.legend()
plt.show()

https://matplotlib.org/3.1.1/api/_as_gen/matplotlib.pyplot.bar.html?highlight=bar#matplotlib.pyplot.bar

대륙별 total_litres_of_pure_alcohol 시각화

continent_mean=drinks.groupby('continent')['total_litres_of_pure_alcohol'].mean()
continent_mean

continent
AF    3.007547
AS    2.170455
EU    8.617778
OC    3.381250
OT    5.995652
SA    6.308333
Name: total_litres_of_pure_alcohol, dtype: float64

continents= continent_mean.index.tolist()
continents

['AF', 'AS', 'EU', 'OC', 'OT', 'SA']

continents.append('MEAN')
continents

['AF', 'AS', 'EU', 'OC', 'OT', 'SA', 'MEAN']

x_pos=np.arange(len(continents)) #그래프 세로
x_pos

array([0, 1, 2, 3, 4, 5, 6])

alcohol= continent_mean.tolist()
alcohol.append(total_mean)
alcohol

[3.00754716981132,
 2.1704545454545454,
 8.617777777777777,
 3.3812500000000005,
 5.995652173913044,
 6.308333333333334,
 4.717098445595855]

total_mean

4.717098445595855

# Mean pure-alcohol consumption per continent, with the overall mean as the
# final bar.
bar_list = plt.bar(x_pos, alcohol, align='center', alpha=0.5)
bar_list[-1].set_color('r')  # colour the last bar red
# Dashed line at the overall mean, from the first to the last bar position
# (taken from x_pos instead of a hard-coded 0..6).
plt.plot([x_pos[0], x_pos[-1]], [total_mean, total_mean], "k--")
plt.xticks(x_pos, continents)

plt.ylabel('total_litres_of_pure_alcohol')
plt.title('total_litres_of_pure_alcohol by Continent')

plt.show()

# Same chart as before, with the last bar coloured green.
bar_list = plt.bar(x_pos, alcohol, align='center', alpha=0.5)
bar_list[-1].set_color('g')
# Dashed line at the overall mean, from the first to the last bar position
# (taken from x_pos instead of a hard-coded 0..6).
plt.plot([x_pos[0], x_pos[-1]], [total_mean, total_mean], "k--")
plt.xticks(x_pos, continents)

plt.ylabel('total_litres_of_pure_alcohol')
plt.title('total_litres_of_pure_alcohol by Continent')

plt.show()

continent_mean

continent
AF    3.007547
AS    2.170455
EU    8.617778
OC    3.381250
OT    5.995652
SA    6.308333
Name: total_litres_of_pure_alcohol, dtype: float64

# Bar chart as before; the dashed line now sits at the 'SA' mean and spans
# x = 3 to x = 5.
bar_list = plt.bar(x_pos, alcohol, align='center', alpha=0.5)
last_bar = bar_list[len(continents) - 1]
last_bar.set_color('g')
sa_mean = continent_mean['SA']
plt.plot([3., 5], [sa_mean, sa_mean], "k--")  # dashed line
plt.xticks(x_pos, continents)

plt.ylabel('total_litres_of_pure_alcohol')
plt.title('total_litres_of_pure_alcohol by Continent')

plt.show()

https://matplotlib.org/3.3.1/api/_as_gen/matplotlib.pyplot.plot.html

!pip install scipy

Requirement already satisfied: scipy in c:\users\w\anaconda3\lib\site-packages (1.5.0)
Requirement already satisfied: numpy>=1.14.5 in c:\users\w\anaconda3\lib\site-packages (from scipy) (1.18.5)

# Rows for the 'AF' continent code.
is_africa = drinks['continent'] == 'AF'
africa = drinks.loc[is_africa]
africa.head()

# Rows for the 'EU' continent code.
is_europe = drinks['continent'] == 'EU'
europe = drinks.loc[is_europe]
europe.head()

drinks.head()

# Total servings across the three drink types.
drinks['total_servings'] = drinks['beer_servings'] + drinks['wine_servings'] + drinks['spirit_servings']

# Litres of pure alcohol per serving.
drinks['alcohol_rate'] = drinks['total_litres_of_pure_alcohol'] / drinks['total_servings']

# Rows with zero servings have no defined rate: 0/0 gives NaN and x/0 gives
# inf.  Both are set to 0 so they sort last and cannot distort the ranking.
drinks['alcohol_rate'] = drinks['alcohol_rate'].replace([np.inf, -np.inf], np.nan).fillna(0)

drinks.head()

# Countries ordered from the highest to the lowest rate.
country_rank = drinks[['country', 'alcohol_rate']]
country_rank = country_rank.sort_values(by=['alcohol_rate'], ascending=False)
country_rank.head()

country_list = country_rank.country.tolist()
x_pos = np.arange(len(country_list))
rank = country_rank.alcohol_rate.tolist()

# 0-based position of South Korea in the ranking, looked up once and reused
# for the bar colour, the rate and the annotation.
korea_rank = country_list.index("South Korea")
korea_alc_rate = rank[korea_rank]

bar_list = plt.bar(x_pos, rank)
bar_list[korea_rank].set_color('r')
plt.ylabel('alcohol rate')
plt.title('liquor drink rank by country')  # title typo 'contry' corrected
plt.axis([0, 200, 0, 0.3])

# Label the highlighted bar with its 1-based rank.
plt.annotate('South Korea : ' + str(korea_rank + 1),
             xy=(korea_rank, korea_alc_rate),
             xytext=(korea_rank + 10, korea_alc_rate + 0.05),
             arrowprops=dict(facecolor='red', shrink=0.05))

plt.show()

total_litres_of_pure_alcohol 평균을 구하기¶

이 평균값보다 적은 알코올을 섭취하는 대륙중에서 spirit을 가장 많이 마시는 국가를 구해보자¶

total_mean=drinks.total_litres_of_pure_alcohol.mean()
total_mean

4.717098445595855

# Total spirit servings per continent.
continent_sum = drinks.groupby('continent').spirit_servings.agg(['sum'])
continent_sum

# Add each continent's mean pure-alcohol consumption.
continent_sum['mean'] = drinks.groupby('continent')['total_litres_of_pure_alcohol'].mean()
continent_sum

# Filter on the 'mean' column just computed, rather than on a
# continent_mean series left over from an earlier cell.
below_average = continent_sum[continent_sum['mean'] < total_mean]
below_average

# Continent with the largest spirit total among those below the overall mean.
below_average.loc[:, 'sum'].idxmax()

'AS'

1)¶

a= drinks['continent'].str.contains('AS|AF|OC')
print(a)

0       True
1      False
2       True
3      False
4       True
       ...  
188    False
189     True
190     True
191     True
192     True
Name: continent, Length: 193, dtype: bool

b = drinks[a]
print(b)

country  beer_servings  spirit_servings  wine_servings  \
0    Afghanistan              0                0              0   
2        Algeria             25                0             14   
4         Angola            217               57             45   
8      Australia            261               72            212   
12       Bahrain             42               63              7   
..           ...            ...              ...            ...   
187      Vanuatu             21               18             11   
189      Vietnam            111                2              1   
190        Yemen              6                0              0   
191       Zambia             32               19              4   
192     Zimbabwe             64               18              4   

     total_litres_of_pure_alcohol continent  total_servings  alcohol_rate  
0                             0.0        AS               0      0.000000  
2                             0.7        AF              39      0.017949  
4                             5.9        AF             319      0.018495  
8                            10.4        OC             545      0.019083  
12                            2.0        AS             112      0.017857  
..                            ...       ...             ...           ...  
187                           0.9        OC              50      0.018000  
189                           2.0        AS             114      0.017544  
190                           0.1        AS               6      0.016667  
191                           2.5        AF              55      0.045455  
192                           4.7        AF              86      0.054651  

[113 rows x 8 columns]

spirit_continent_max = b[b['spirit_servings'] == b['spirit_servings'].max()]
spirit_continent_max

2)¶

# Rows whose continent code is one of AS, AF or OC.
in_selected = drinks['continent'].isin(['AS', 'AF', 'OC'])
a = drinks[in_selected]
a

# Row(s) holding the highest spirit_servings among them.
highest_spirit = a['spirit_servings'].max()
spirit_continent_max = a[a['spirit_servings'] == highest_spirit]
spirit_continent_max

drinks

total_litres_of_pure_alcohol 평균을 구하기¶

이 평균값보다 적은 알코올을 섭취하는 대륙중에서 spirit을 가장 많이 마시는 국가를 구해보자¶

drinks

total_mean=drinks.total_litres_of_pure_alcohol.mean()
total_mean

4.717098445595855

continent_mean=drinks.groupby('continent')['total_litres_of_pure_alcohol'].mean()
continent_mean

continent
AF    3.007547
AS    2.170455
EU    8.617778
OC    3.381250
OT    5.995652
SA    6.308333
Name: total_litres_of_pure_alcohol, dtype: float64

 # Continent codes whose mean is at or below the overall mean.
 # NOTE(review): this cell carries a stray one-space indent; a plain .py
 # script would reject it - confirm and strip when exporting.
 at_or_below = continent_mean <= total_mean
 continent_under_mean = continent_mean[at_or_below].index.tolist()
 continent_under_mean

['AF', 'AS', 'OC']

A[A.column_name.isin(B)]¶

A데이터 프레임에서 A column_name 피처가 B안에 포함되어 있는 데이터만 가져옴

# Rows of drinks whose continent is in continent_under_mean.
in_under_mean = drinks.continent.isin(continent_under_mean)
df_continent_under_mean = drinks.loc[in_under_mean]
df_continent_under_mean

# The row with the highest spirit_servings among them.
top_spirit_label = df_continent_under_mean['spirit_servings'].idxmax()
most_spirit_under_mean = df_continent_under_mean.loc[top_spirit_label]
most_spirit_under_mean

country                         Russian Federation
beer_servings                                  247
spirit_servings                                326
wine_servings                                   73
total_litres_of_pure_alcohol                  11.5
continent                                       AS
total_servings                                 646
alcohol_rate                             0.0178019
Name: 141, dtype: object

10살 단위로 인구를 정리한다.
70대 이상은 하나로 합친다.
3개의 기간동안 인구가 증가하는 곳을 찾는다.
남성이 여성보다 많이 거주하는 곳 가운데 인구가 3만 이상인 곳을 찾는다. (없으면 없다)

import pandas as pd
import numpy as np

2018.06.30.기준 대구광역시 주민등록인구 현황¶

population_01 = pd.read_csv('data/2018.06.30.기준 대구광역시 주민등록인구 현황_UTF8.csv')
population_01

df = population_01.copy()

# Columns to discard: any whose label contains one of these markers.
# ' 남' and ' 여' carry a leading space so that '남구' is not dropped.
check_list = ['총계', '구분', ' 남', ' 여']
drop_list = [
    item for item in df.columns
    if any(chk in item for chk in check_list)
]

df1 = df.drop(drop_list, axis=1)
df1

# Label every row with a multiple of ten (0, 10, ..., 100) by row position.
# The label is assigned once; it was previously assigned twice in a row.
# NOTE(review): assumes rows 0..100 hold ages 0..100 in order - confirm
# against the CSV.
df1['인구0'] = pd.Series(range(0, 101)) // 10 * 10
df1

# Sum every remaining column within each ten-wide band.
df1 = df1.groupby('인구0').sum()
df1

# Fold the bands from 70 upwards into a single band labelled 70 and sum again.
df1['인구'] = df1.index
df1['인구'] = np.where(df1['인구'] >= 70, 70, df1['인구'])
df1 = df1.groupby('인구').sum()
df1

#df1['인구'] = (pd.DataFrame(range(0,101))/10).astype(int)*10
#df1['인구'] = np.where(df1['인구'] >= 70, 70, df1.인구)
#df1 = df1.groupby('인구')
#df1 = df1.sum()
#df1

2018.12.31.기준 대구광역시 주민등록인구 현황¶

population_02 = pd.read_csv('data/2018.12.31.기준 대구광역시 주민등록인구 현황_UTF8.csv')
population_02.head()

df2 = population_02.copy()

# Columns to discard: any whose label contains one of these markers.
# ' 남' and ' 여' carry a leading space so that '남구' is not dropped.
check_list = ['총계', '구분', ' 남', ' 여']
drop_list = [
    item for item in df2.columns
    if any(chk in item for chk in check_list)
]
df2_1 = df2.drop(drop_list, axis=1)

# Multiple-of-ten band label per row, with 70 and above folded into 70.
band = (pd.DataFrame(range(0, 101)) / 10).astype(int) * 10
df2_1['인구'] = band
df2_1['인구'] = np.where(df2_1['인구'] >= 70, 70, df2_1['인구'])
df2_1 = df2_1.groupby('인구').sum()
df2_1

2019.06.30.기준 대구광역시 주민등록인구 현황¶

population_03 = pd.read_csv('data/2019.06.30.기준 대구광역시 주민등록인구 현황_UTF8.csv',thousands=',')
population_03.head()

df3 = population_03.copy()

# Columns to discard: any whose label contains one of these markers.
# ' 남' and ' 여' carry a leading space so that '남구' is not dropped.
check_list = ['총계', '행정구역', ' 남', ' 여']
drop_list = [
    item for item in df3.columns
    if any(chk in item for chk in check_list)
]

df3_1 = df3.drop(drop_list, axis=1)

# Multiple-of-ten band label per row, with 70 and above folded into 70.
band = (pd.DataFrame(range(0, 101)) / 10).astype(int) * 10
df3_1['인구'] = band
df3_1['인구'] = np.where(df3_1['인구'] >= 70, 70, df3_1['인구'])
df3_1 = df3_1.groupby('인구').sum()
df3_1

3개의 기간동안 인구가 증가하는 곳을 찾는다¶

# Transpose each table and add a row total column '총합'.
df1 = df1.transpose()
df1['총합'] = df1.sum(axis='columns').astype(int)
df1

df2_1 = df2_1.transpose()
df2_1['총합'] = df2_1.sum(axis='columns').astype(int)
df2_1

df3_1 = df3_1.transpose()
df3_1['총합'] = df3_1.sum(axis='columns').astype(int)
df3_1

# A row counts as growing only if its total rose in both consecutive steps.
pop1 = df2_1['총합'].sub(df1['총합']).gt(0)
pop2 = df3_1['총합'].sub(df2_1['총합']).gt(0)
df3_1['인구증가'] = pop1 & pop2
df3_1

df3_1[df3_1['인구증가'].eq(True)]

4. 다음조건 찾기¶

남성인구 > 여성인구
인구수 > 20만

df = population_01.copy()

# Drop the columns whose label contains '총계' or '구분'.
check_list = ['총계', '구분']
drop_list = [
    item for item in df.columns
    if any(chk in item for chk in check_list)
]

df = df.drop(drop_list, axis=1)

# Multiple-of-ten band label per row, with 70 and above folded into 70.
band = (pd.DataFrame(range(0, 101)) / 10).astype(int) * 10
df['인구'] = band
df['인구'] = np.where(df['인구'] >= 70, 70, df['인구'])
df = df.groupby('인구').sum()

# Append a '총합' row holding the total of every column.
df = df.T
df['총합'] = df.sum(axis='columns').astype(int)
df = df.T
df

# Male-only view: drop the ' 계' (total) and ' 여' (female) columns.
check_list = [' 계', ' 여']
drop_list = list({item for chk in check_list for item in df.columns if chk in item})

ex4_df1_M = df.drop(drop_list, axis=1).T
ex4_df1_M

# Female-only view: drop the ' 계' (total) and ' 남' (male) columns.
check_list = [' 계', ' 남']
drop_list = list({item for chk in check_list for item in df.columns if chk in item})

ex4_df1_F = df.drop(drop_list, axis=1).T
ex4_df1_F

# Totals-only view: drop both the male and the female columns.
check_list = [' 남', ' 여']
drop_list = list({item for chk in check_list for item in df.columns if chk in item})

ex4_df1_T = df.drop(drop_list, axis=1).T
ex4_df1_T

# Flag districts whose male total exceeds the female total. The comparison is
# positional (.values), so it relies on both frames listing districts in the
# same order.
ex4_df1_T['남초여부'] = np.greater(ex4_df1_M['총합'].values, ex4_df1_F['총합'].values)
ex4_df1_T

# Final condition: male-majority AND total population above 200,000.
case1 = ex4_df1_T['남초여부'].eq(True)
case2 = ex4_df1_T['총합'].gt(200000)
case = case1 & case2
ex4_df1_T['결과'] = case
ex4_df1_T

ex4_df1_T[ex4_df1_T['결과'].eq(True)]

ex4_df1_T = ex4_df1_T.T
ex4_df1_T

컬럼 이름 변경¶

새로운 리스트로 덮어씀

# Overwrite every column label at once; the list needs one name per existing column.
ex4_df1_T.columns = ['중구','동구','서구','남구','북구','수성구','달서구','달성군']
ex4_df1_T

replace를 사용하여 이름 명 변경(대체)

# Append " 계" after every "구" in the column labels (labels without "구" are
# left as they are). regex=False states that this is a literal substring
# replacement, independent of the pandas version's default for `regex`.
ex4_df1_T.columns = ex4_df1_T.columns.str.replace("구", "구 계", regex=False)
ex4_df1_T

각각 이름 변경시 rename을 사용

# rename() changes only the labels present in the mapping and returns a new frame.
ex4_df1_T = ex4_df1_T.rename({'중구 계':'대구중구'}, axis='columns')
ex4_df1_T

5. 대구광역시의 각 구별 인구수를 정리하여 표시하고 인구가 가장 많은 지역과 가장 적은 지역을 구하여 출력 하라(3개 파일에서 각각 구하라)¶

# Totals-only view: drop the male and female columns, then put districts on rows.
check_list = [' 남', ' 여']
drop_list = list({item for chk in check_list for item in df.columns if chk in item})

df1_T = df.drop(drop_list, axis=1).T
df1_T

df1_T1 = df1_T.copy()

# Most populous district.
max_1 = df1_T1['총합'].idxmax()
df1_T1.loc[df1_T1.index == max_1]

# Least populous district.
min_1 = df1_T1['총합'].idxmin()
df1_T1.loc[df1_T1.index == min_1]

# Store the same information as boolean flag columns on the copy.
max_1 = df1_T1.index == df1_T['총합'].idxmax()
min_1 = df1_T1.index == df1_T['총합'].idxmin()
df1_T1['최대인구 구'] = max_1
df1_T1['최소인구 구'] = min_1
df1_T1

df2 = population_02.copy()

# Common cleanup: drop the grand-total columns and the row-label column.
check_list = ['총계', '구분',]
drop_list = list({item for chk in check_list for item in df2.columns if chk in item})

df2 = df2.drop(drop_list, axis=1)

# Band the rows by decade, merge everything from 70 upward, and sum per band.
df2['인구'] = (pd.DataFrame(range(0,101))/10).astype(int)*10
df2['인구'] = df2['인구'].clip(upper=70)
df2 = df2.groupby('인구').sum()

# Append a '총합' (total) row: transpose, add it as a column, transpose back.
df2 = df2.T
df2['총합'] = df2.sum(axis='columns').astype(int)
df2 = df2.T
df2

# Totals-only view: drop the male and female columns, then put districts on rows.
check_list = [' 남', ' 여']
drop_list = list({item for chk in check_list for item in df2.columns if chk in item})

df2_T = df2.drop(drop_list, axis=1).T
df2_T

df2_T2 = df2_T.copy()

# Most populous district.
max_2 = df2_T2['총합'].idxmax()
df2_T2.loc[df2_T2.index == max_2]

# Least populous district.
min_2 = df2_T2['총합'].idxmin()
df2_T2.loc[df2_T2.index == min_2]

# Store the same information as boolean flag columns on the copy.
max_2 = df2_T2.index == df2_T['총합'].idxmax()
min_2 = df2_T2.index == df2_T['총합'].idxmin()
df2_T2['최대인구 구'] = max_2
df2_T2['최소인구 구'] = min_2
df2_T2

df3 = population_03.copy()

# Common cleanup: drop the grand-total columns and the row-label column.
# The earlier pass over this same file removes its label column under the name
# '행정구역', while the other two files use '구분'; both are listed so that a
# text column never reaches the groupby/sum below.
# NOTE(review): confirm the actual label-column name of the 2019.06.30 file.
check_list = ['총계', '구분', '행정구역']
drop_list = [item for item in df3.columns if any(chk in item for chk in check_list)]

df3 = df3.drop(drop_list, axis=1)

# Band the rows by decade, merge everything from 70 upward, and sum per band.
df3['인구'] = (pd.DataFrame(range(0,101))/10).astype(int)*10
df3['인구'] = np.where(df3['인구'] >= 70, 70, df3.인구)
df3 = df3.groupby('인구').sum()

# Append a '총합' (total) row: transpose, add it as a column, transpose back.
df3 = df3.T
df3['총합'] = df3.sum(axis='columns').astype(int)
df3 = df3.T
df3

# Totals-only view: drop the male and female columns, then put districts on rows.
check_list = [' 남', ' 여']
drop_list = [item for item in df3.columns if any(chk in item for chk in check_list)]

df3_T = df3.drop(drop_list, axis=1)
df3_T = df3_T.T
df3_T

df3_T3 = df3_T.copy()

# Most populous district.
max_3 = df3_T3['총합'].idxmax()
df3_T3[df3_T3.index == max_3]

# Least populous district. The filter must use min_3; the previous code used
# min_1, which by this point holds a boolean mask computed for the first file.
min_3 = df3_T3['총합'].idxmin()
df3_T3[df3_T3.index == min_3]

# Store the flags on the working copy (as done for the other two files) so
# that df3_T stays a clean totals table for the later ratio analysis.
max_3 = df3_T['총합'].idxmax() == df3_T3.index
min_3 = df3_T['총합'].idxmin() == df3_T3.index
df3_T3['최대인구 구'] = max_3
df3_T3['최소인구 구'] = min_3
df3_T3

6. 대구광역시의 각 구별 20대 대비 60세 이상의 인구 비율을 구하여 출력하고 이 비율이 가장 높은 지역을 구하여 출력하라¶

df1_T

# People aged 60 and over = the 60 band plus the merged 70-and-over band.
df1_T['60대이상'] = df1_T[60].add(df1_T[70]).astype(int)
df1_T

# 60+ population relative to people in their twenties, in percent (1 decimal).
df1_T['rate'] = df1_T['60대이상'].div(df1_T[20]).mul(100).round(1)
df1_T

# Row with the highest ratio.
max_6_1 = df1_T['rate'].idxmax()
df1_T.loc[df1_T.index == max_6_1]

df2_T

df2_T['60대이상'] = df2_T[60].add(df2_T[70]).astype(int)
df2_T

df2_T['rate'] = df2_T['60대이상'].div(df2_T[20]).mul(100).round(1)
df2_T

max_6_2 = df2_T['rate'].idxmax()
df2_T.loc[df2_T.index == max_6_2]

df3_T

df3_T['60대이상'] = df3_T[60].add(df3_T[70]).astype(int)
df3_T

df3_T['rate'] = df3_T['60대이상'].div(df3_T[20]).mul(100).round(1)
df3_T

max_6_3 = df3_T['rate'].idxmax()
df3_T.loc[df3_T.index == max_6_3]

import pandas as pd

# Load data/titanic_train.csv; sep=',' is read_csv's default, spelled out for clarity.
df= pd.read_csv('data/titanic_train.csv', sep=',')

df.head()

# Create the sample file first so that the read below does not depend on a
# copy of ex1.csv left over from an earlier run.
data = """a,b,c,d,message
1,2,3,4,hello
5,6,7,8,world
9,10,11,12,foo"""
# The context manager closes the file even if the write fails.
with open("data/ex1.csv", 'w') as f:
    f.write(data)

df1 = pd.read_csv('data/ex1.csv')

df1.head()

# Sample file without a header row.
data = """
1,2,3,4,hello
5,6,7,8,world
9,10,11,12,foo
"""
# The context manager closes the file even if the write fails.
with open("data/ex2.csv", 'w') as f:
    f.write(data)

# header=None: no header row in the file; columns get integer labels.
df2 = pd.read_csv('data/ex2.csv', header=None)
df2

# names=...: supply the column labels explicitly.
df2 = pd.read_csv('data/ex2.csv', names=['a','b','c','d','message'])
df2

# index_col='message': the 'message' column becomes the row index.
names = ['a','b','c','d','message']
df2 = pd.read_csv('data/ex2.csv', names=names, index_col='message')
df2

# Sample file with two key columns, used for the hierarchical-index example.
data = """key1,key2,value1,value2
one,a,1,2
one,b,3,4
one,c,5,6
one,d,7,8
two,a,9,10
two,b,11,12
two,c,13,14
two,d,15,16
"""
# The context manager closes the file even if the write fails.
with open("data/csv_mindex.csv", 'w') as f:
    f.write(data)

계층적 색인 지정

# A list passed to index_col builds a hierarchical (MultiIndex) row index from those columns.
parsed= pd.read_csv('data/csv_mindex.csv', index_col=['key1', 'key2'])
parsed

# Whitespace-aligned sample table; the header has one name fewer than the data
# rows have fields.
data = """A         B         C
aaa -0.264438 -1.026059 -0.619500
bbb  0.927272  0.302904 -0.032399
ccc -0.264273 -0.386314 -0.217601
ddd -0.871859 -0.348382  1.100491
"""
# The context manager closes the file even if the write fails.
with open("data/ex3.txt", 'w') as f:
    f.write(data)

# r'\s+' splits on one or more whitespace characters. The raw-string prefix
# is required: '\s' in a normal string literal is an invalid escape sequence.
result = pd.read_table('data/ex3.txt', sep=r'\s+')
result

# Sample CSV with '#' comment lines mixed into the data.
data = """#Hey!
a,b,c,d,message
#by python
#csv file
1,2,3,4,hello
5,6,7,8,world
9,10,11,12,foo"""
with open("data/ex4.csv", 'w') as f:
    f.write(data)

# Collect the zero-based numbers of the comment lines. Only lines that START
# with '#' count; a data row that merely contains '#' must not be skipped.
skiplist = []
with open("data/ex4.csv", 'r') as f:
    for count, line in enumerate(f):
        if line.startswith('#'):
            print(line)
            skiplist.append(count)
skiplist

#Hey!

#by python

#csv file

[0, 2, 3]

# Same scan as above with an explicit encoding and no printing.
# Only lines that START with '#' are treated as comments.
skiplist = []
with open('data/ex4.csv', 'r', encoding='utf-8') as f:
    for lineNum, lines in enumerate(f):
        if lines.startswith('#'):
            skiplist.append(lineNum)
skiplist

[0, 2, 3]

# Skip the comment lines using hard-coded line numbers.
df1 = pd.read_csv('data/ex4.csv', skiprows=[0, 2, 3])
df1

# Same read, using the computed list of comment-line numbers.
df1 = pd.read_csv('data/ex4.csv', skiprows=skiplist)
df1  # show the frame that was just read (previously displayed the unrelated df2)

# Sample file containing an 'NA' marker and an empty field.
data = """something,a,b,c,d,message
one,1,2,3,4,NA
two,5,6,,8,world
three,9,10,11,12,foo
"""
# The context manager closes the file even if the write fails.
with open("data/ex5.csv", 'w') as f:
    f.write(data)

result = pd.read_csv('data/ex5.csv')
result

pd.isnull(result)

# na_values: additional values to treat as NaN (e.g. passing 1 would turn
# every 1 into NaN).
result = pd.read_csv('data/ex5.csv', na_values=['NULL'])
result

# Dict form: a separate list of NaN markers for each column.
sentinels = {'message': ['foo','world'], 'something': ['two']}
result = pd.read_csv('data/ex5.csv', na_values=sentinels)
result

엑셀파일읽기¶

xlsx = 'data/ex01.xlsx'
# The second positional argument selects the sheet to read.
frame= pd.read_excel(xlsx, 'Sheet1')
frame

엑셀파일저장¶

# Use ExcelWriter as a context manager: the workbook is saved and closed on
# exit. (ExcelWriter.save() no longer exists in pandas 2.x, and sheet_name
# should be passed by keyword.)
with pd.ExcelWriter('data/ex02.xlsx') as writer:
    frame.to_excel(writer, sheet_name='Sheet1')

누락된 데이터 처리¶

인자	설명
dropna	누락된 데이터가 있는 축(row, column)을 제외시킨다. 어느 정도의 누락 데이터까지 용인할 것인지 지정할 수 있다.
fillna	누락된 데이터를 대신할 값을 채우거나 'ffill'이나 'bfill'같은 보간 메서드를 적용한다.
isnull	누락되거나 NA인 값을 알려주는 불리언 값이 저장된 같은 형의 객체를 반환
notnull	isnull과 반대되는 메서드

import numpy as np 
import pandas as pd

string_data = pd.Series(['aardvark', 'artichoke', np.nan, 'avocado'])
string_data

0     aardvark
1    artichoke
2          NaN
3      avocado
dtype: object

string_data.isnull()

0    False
1    False
2     True
3    False
dtype: bool

string_data[0]= None  # Python None is also reported as missing by isnull()
string_data.isnull()

0     True
1    False
2     True
3    False
dtype: bool

from numpy import nan as NA  # short alias for NaN used in the examples that follow
data= pd.Series([1,NA,3.5,NA,7])
data.dropna()  # returns the Series without its NaN entries

0    1.0
2    3.5
4    7.0
dtype: float64

data[data.notnull()] # same result as data.dropna()

0    1.0
2    3.5
4    7.0
dtype: float64

data = pd.DataFrame([
    [1., 6.5, 3.],
    [1., NA, NA],
    [NA, NA, NA],
    [NA, 6.5, 3.],
])
data

# how="any" (the default): a row is dropped as soon as it holds a single NaN.
cleand = data.dropna(how="any")
cleand

# how="all": a row is dropped only when every value in it is NaN.
cleand = data.dropna(how="all")
cleand

# Add a column labelled 4 that is entirely NaN.
data[4] = NA
data

# axis=1 is the integer spelling of axis='columns' (the string '1' is not valid).
cleand = data.dropna(axis=1, how="all")
cleand

df = pd.DataFrame(np.random.randn(7, 3))  # 7 rows x 3 columns
df

df.iloc[:3, 1] = NA
df.iloc[0, 2] = NA
df

# thresh=2 keeps only the rows that hold at least 2 non-NaN values.
cleaned = df.dropna(thresh=2)
cleaned

결측치 채우기¶

df

filled=df.fillna(0) # replace every NaN with 0
filled

filled2=df.fillna({1:0.9, 2:0})  # dict form: per-column fill values (column 1 -> 0.9, column 2 -> 0)
filled2

df.fillna(0, inplace=False) # returns a filled copy; the original df is not modified
df

df.fillna(0, inplace=True) # fills df itself: its NaN values become 0
df

df = pd.DataFrame(np.random.randn(7, 3))
df.iloc[2:, 1] = NA
df.iloc[4:, 2] = NA
df

# Forward fill: each NaN takes the last valid value above it in its column.
# DataFrame.ffill() replaces the deprecated fillna(method='ffill') spelling.
filled = df.ffill()
filled

# limit=2: fill at most two consecutive NaN values per gap.
filled = df.ffill(limit=2)
filled

df.fillna(value=, method=, axis=0, inplace=, limit=)¶

value : 비어있는 값을 채울 스칼라 값이나 dictonary 형식의 객체
method : 보간법('ffill' 또는 'bfill', 기본값=None)
axis : 값을 채워넣을 축(기본 axis=0)
inplace : 복사본을 생성하지 않고 호출한 객체에 값을 반환(기본값=False )
limit : 값을 앞 또는 뒤로 몇개까지 채울지 지정

data = pd.DataFrame({
    'k1': ['one', 'two', 'one', 'two', 'one', 'two', 'two'],
    'k2': [1, 1, 2, 3, 3, 4, 4],
})
data

data.duplicated()  # True marks a row that repeats an earlier row

0    False
1    False
2    False
3    False
4    False
5    False
6     True
dtype: bool

data.drop_duplicates() # duplicated rows are removed

data['v1']=range(7)
data

data.drop_duplicates(['k1']) # de-duplicate on column k1 only

data.drop_duplicates(['k2'])# de-duplicate on column k2 only

data.drop_duplicates(['v1']) # v1 has no duplicates, so the result equals the input

data

data.drop_duplicates(['k1','k2'], keep='last') # keep='last': the last row of each duplicate group survives
# The rows with k1='two', k2=4 are the ones with v1=5 and v1=6; only the last (v1=6) is kept.

데이터 변형하기¶

data = pd.DataFrame({
    'food': [
        'bacon', 'pulled pork', 'bacon',
        'Pastrami', 'corned beef', 'Bacon',
        'pastrami', 'honey ham', 'nova lox',
    ],
    'ounces': [4, 3, 12, 6, 7.5, 8, 3, 5, 6],
})
data

# Lookup table from (lower-case) food name to the animal it comes from.
meat_to_animal = dict([
    ('bacon', 'pig'),
    ('pulled pork', 'pig'),
    ('pastrami', 'cow'),
    ('corned beef', 'cow'),
    ('honey ham', 'pig'),
    ('nova lox', 'salmon'),
])
meat_to_animal

{'bacon': 'pig',
 'pulled pork': 'pig',
 'pastrami': 'cow',
 'corned beef': 'cow',
 'honey ham': 'pig',
 'nova lox': 'salmon'}

# Convert the food names to lower case
lower_cased=data['food'].str.lower()
lower_cased

0          bacon
1    pulled pork
2          bacon
3       pastrami
4    corned beef
5          bacon
6       pastrami
7      honey ham
8       nova lox
Name: food, dtype: object

# Look up each lower-cased food name in meat_to_animal.
data['animal']= lower_cased.map(meat_to_animal)
data

# One-step alternative to the two statements above (lower-case, then map).
# Indexing the dict inside the lambda raises KeyError for a food that is
# missing from meat_to_animal.
data['animal']= data['food'].map(lambda x: meat_to_animal[x.lower()])
data

data=pd.Series([1., -999., 2., -999., -1000., 3.])
data

0       1.0
1    -999.0
2       2.0
3    -999.0
4   -1000.0
5       3.0
dtype: float64

data2 = data.replace(-999, np.nan)
data2

0       1.0
1       NaN
2       2.0
3       NaN
4   -1000.0
5       3.0
dtype: float64

data2 = data.replace([-999, -1000], np.nan)
data2

0    1.0
1    NaN
2    2.0
3    NaN
4    NaN
5    3.0
dtype: float64

data2 = data.replace([-999, -1000], [np.nan, 0])
data2

0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    3.0
dtype: float64

data2= data.replace({-999.:np.nan, -1000.:0})
# dict form: -999 -> NaN, -1000 -> 0
data2

0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    3.0
dtype: float64

data = pd.DataFrame(
    np.arange(12).reshape(3, 4),
    index=['Ohio', 'Colorado', 'New York'],
    columns=['one', 'two', 'three', 'four'],
)
data


def transform(x):
    """Return the first four characters of *x*, upper-cased."""
    return x[:4].upper()


data.index.map(transform)

Index(['OHIO', 'COLO', 'NEW '], dtype='object')

data.index

Index(['Ohio', 'Colorado', 'New York'], dtype='object')

data.index= data.index.map(transform)
data

str.title : 단어의 시작 부분에 있는 문자는 대문자로, 나머지는 모두 소문자로 만든다.

data.rename(index=str.title, columns=str.upper)  # functions are applied to every label; the result is not assigned

data

data.rename(index={'OHIO':'INDIANA'}, columns={'three':'peekaboo'})  # dict form renames only the listed labels

data

data.rename(index={'OHIO':'INDIANA'}, inplace=True) # modifies the existing object
data

data

나이 나누기
19~25(0)
26~35(1)
36~60(2)
61이상(3)

ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]

# Edges of the age bins.
bins = [18, 25, 35, 60, 100]
# pd.cut(values to categorise, bin edges)
cats = pd.cut(x=ages, bins=bins)
cats

[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
Length: 12
Categories (4, interval[int64]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]

cats.codes # integer position of the bin each element falls into

array([0, 0, 0, 1, 0, 0, 2, 1, 3, 2, 2, 1], dtype=int8)

cats.categories # ( ] : left edge excluded, right edge included

IntervalIndex([(18, 25], (25, 35], (35, 60], (60, 100]],
              closed='right',
              dtype='interval[int64]')

# Number of elements per bin. The top-level pd.value_counts() helper is
# deprecated; wrapping the Categorical in a Series gives the same counts.
pd.Series(cats).value_counts()

(18, 25]     5
(35, 60]     3
(25, 35]     3
(60, 100]    1
dtype: int64

pd.cut(ages,[18,25,35,60,100], right=False) # [ ) : left edge included, right edge excluded

[[18, 25), [18, 25), [25, 35), [25, 35), [18, 25), ..., [25, 35), [60, 100), [35, 60), [35, 60), [25, 35)]
Length: 12
Categories (4, interval[int64]): [[18, 25) < [25, 35) < [35, 60) < [60, 100)]

pd.cut(ages,[18,25,35,60,100], right=True) # the opposite: ( ] left edge excluded, right edge included

[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
Length: 12
Categories (4, interval[int64]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]

group_names= ['Youth', 'youngAdult', 'MiddleAged', 'Senior']

data2= pd.cut(ages,bins,labels=group_names)
data2

[Youth, Youth, Youth, youngAdult, Youth, ..., youngAdult, Senior, MiddleAged, MiddleAged, youngAdult]
Length: 12
Categories (4, object): [Youth < youngAdult < MiddleAged < Senior]

data2.value_counts()

Youth         5
youngAdult    3
MiddleAged    3
Senior        1
dtype: int64

각 구간 구분값을 bin으로 정의해서 나누었는데 pandas에서 알아서 판단하여 데이터의 길이를 잘라주고 구간을 설정할 수도 있음.

data= np.random.rand(20)
data

array([0.7166629 , 0.27019459, 0.74586198, 0.46917089, 0.45317755,
       0.10626844, 0.08305884, 0.57318399, 0.5936407 , 0.78475135,
       0.33795658, 0.99624303, 0.2733613 , 0.89804862, 0.10861183,
       0.43367864, 0.09130907, 0.8281011 , 0.54757824, 0.0145025 ])

데이터 성분값을 기준으로 자동으로 구간을 나누게 하기 위해서 나눌 구간의 개수만 입력해준다 (성분의 최소값~최대값을 보고 4개구간 나눔)

pd.cut(data, 4, precision=2) # 4 equal-width bins; precision=2 sets the precision used for the bin labels

[(0.51, 0.75], (0.26, 0.51], (0.51, 0.75], (0.26, 0.51], (0.26, 0.51], ..., (0.26, 0.51], (0.014, 0.26], (0.75, 1.0], (0.51, 0.75], (0.014, 0.26]]
Length: 20
Categories (4, interval[float64]): [(0.014, 0.26] < (0.26, 0.51] < (0.51, 0.75] < (0.75, 1.0]]

data= pd.DataFrame(np.random.randn(1000,4)) # randn (with the trailing "n") samples from the standard normal distribution
data

data.describe()

data[2] # column 2

0      2.304293
1     -0.256742
2     -0.285212
3      1.830050
4      0.856561
         ...   
995    1.182251
996    0.611918
997    2.086748
998    0.240342
999    0.889384
Name: 2, Length: 1000, dtype: float64

col= data[2]
col[np.abs(col)>3]

402   -3.170485
519   -3.460535
Name: 2, dtype: float64

# Rows in which at least one value has an absolute value greater than 3.
# The axis is passed by keyword; the positional form .any(1) is no longer accepted.
data[(np.abs(data) > 3).any(axis=1)]

np.sign(x) : x<0일 때 -1, x==0일 때 0, x>0일 때 1을 반환

# Cap outliers: np.sign gives -1 or 1, so multiplying by 3 turns every value beyond +/-3 into -3 or 3.
data[np.abs(data)>3]= np.sign(data)*3
data.describe()

data.head()

np.sign(data).head() # sign of each value

df = pd.DataFrame(np.arange(5 * 4).reshape((5, 4)))
df

sampler = np.random.permutation(5) # randomly shuffled array of 0..4
sampler

array([4, 1, 3, 2, 0])

df.take(sampler) # select rows by the positions listed in sampler

df.sample(n=3) # draw 3 rows at random

choices = pd.Series([5, 7, -1, 6, 4])
choices

0    5
1    7
2   -1
3    6
4    4
dtype: int64

draws = choices.sample(n=10, replace=True)
draws

4    4
4    4
0    5
1    7
1    7
0    5
1    7
1    7
3    6
1    7
dtype: int64

onehot 인코딩¶

문자를 숫자로 바꾸어 주는 방법 중 하나로 onehot인코딩이 있다.
가변수로 만들어주는 것인데, 이는 0과 1로 이루어진 열을 나타낸다.
1은 있다 0은 없다를 나타낸다.

df = pd.DataFrame({
    'key': list('bbacab'),
    'data1': range(6),
})
df

# One indicator column per distinct value of df['key'].
pd.get_dummies(df['key'])

dummies = pd.get_dummies(df['key'])
dummies

# prefix='key' names the indicator columns after the source column.
dummies = pd.get_dummies(df['key'], prefix='key')
dummies

# Put the remaining data column next to the indicator columns.
df_with_dummy = pd.concat([df[['data1']], dummies], axis=1)
df_with_dummy

mnames = ['movie_id', 'title', 'genres']
# A multi-character separator such as '::' is not supported by the default C
# parser; selecting the Python engine explicitly avoids the fallback warning.
movies = pd.read_table('movies.dat', sep='::', header=None, names=mnames,
                       engine='python')
movies[:10]

# Collect every genre label; each movie's genres are '|'-separated.
all_genres = []
for x in movies.genres:
    all_genres.extend(x.split('|'))
# Distinct genres in order of first appearance. pd.unique() is given an
# ndarray because passing a plain list is deprecated.
genres = pd.unique(np.array(all_genres, dtype=object))
genres

array(['Animation', "Children's", 'Comedy', 'Adventure', 'Fantasy',
       'Romance', 'Drama', 'Action', 'Crime', 'Thriller', 'Horror',
       'Sci-Fi', 'Documentary', 'War', 'Musical', 'Mystery', 'Film-Noir',
       'Western'], dtype=object)

# One row per movie and one column per genre, initialised to 0.
zero_matrix=np.zeros((len(movies), len(genres)))
dummies= pd.DataFrame(zero_matrix, columns=genres)
dummies

# Example with the first movie: column positions of its genres in dummies.
gen= movies.genres[0]
gen.split('|')
dummies.columns.get_indexer(gen.split('|'))

array([0, 1, 2], dtype=int64)

dummies

enumerate
: 보통 for문과 함께 사용
: 인덱스 값을 포함하는 enumerate객체를 리턴
: for문처럼 반복되는 구간에서 객체가 현재 어느 위치에 있는지 알려주는 인덱스 값이 필요할 때 사용하면 유용

# For every movie, set the indicator columns of its genres to 1.
for i, gen in enumerate(movies.genres):
    indices = dummies.columns.get_indexer(gen.split('|')) # gen is the '|'-separated genre string of row i
    dummies.iloc[i, indices] = 1 

dummies

# Show up to 25 columns so the joined frame is not truncated when displayed.
pd.set_option('display.max_columns', 25)
#pd.set_option('display.max_row', 10)
# Prefix the genre columns with 'Genre_' and attach them to the movie table.
movies_windic= movies.join(dummies.add_prefix('Genre_'))
movies_windic.head()

np.random.seed(12345)
values = np.random.rand(10)
values

array([0.92961609, 0.31637555, 0.18391881, 0.20456028, 0.56772503,
       0.5955447 , 0.96451452, 0.6531771 , 0.74890664, 0.65356987])

bins=[0, 0.2, 0.4, 0.6, 0.8, 1]
pd.get_dummies(pd.cut(values, bins))

val= ' a,b, guido '
val.split(',') # the pieces keep their surrounding whitespace

[' a', 'b', ' guido ']

val

' a,b, guido '

val= 'a,b, guido'
val.strip() # removes whitespace at both ends (this value has none, so it comes back unchanged)

'a,b, guido'

pieces = [x.strip() for x in val.split(',')]
pieces

['a', 'b', 'guido']

first, second, third= pieces 
first+ '::'+ second+ '::'+ third

'a::b::guido'

first

'a'

pieces

['a', 'b', 'guido']

'::'.join(pieces)

'a::b::guido'

'guido' in val

True

val.index(',')

1

val.find(':')

-1

val.find(',')

1

val.index(':')

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-259-2c016e7367ac> in <module>
----> 1 val.index(':')

ValueError: substring not found

val.count(',')

2

val.replace(',', '::')

'a::b:: guido'

val.replace(',', '')

'ab guido'

np.reshape, np.resize 차이점2, flatten, ravel, copy, view (0)	2021.03.30
np.reshape, np.resize 차이점 (0)	2021.03.30
ndim, shape, size (0)	2021.03.29

파이썬/ sklearn/가장 근사한 방정식 구하기/다항회귀 (0)	2021.11.26
Blending (0)	2021.11.26
마크다운 요약 (0)	2020.09.29
Python_우리나라 아기이름 데이터 분석 (1)	2020.09.15
Python_데이터분석2 (0)	2020.09.15

Blending (0)	2021.11.26
Seaborn, Matplotlib (0)	2020.10.04
Python_우리나라 아기이름 데이터 분석 (1)	2020.09.15
Python_데이터분석2 (0)	2020.09.15
Python_판다스_데이터분석 (0)	2020.09.14

gender	F	M
year
2008	191282	169063
2009	186337	166347
2010	195842	177375
2011	199164	180055
2012	204024	185399
2013	189671	173103
2014	187089	174520
2015	190049	178228
2016	176951	165692
2017	154957	148076
2018	144814	137774
2019	133687	128559
2020	52482	50294

			name	gender	births	year	prop
year	gender
2008	F	0	서연	F	3280	2008	0.017147
		1	민서	F	2873	2008	0.015020
		2	지민	F	2826	2008	0.014774
		3	서현	F	2606	2008	0.013624
		4	서윤	F	2484	2008	0.012986
...	...	...	...	...	...	...	...
2020	M	12597	승민	M	126	2020	0.002505
		12598	시환	M	126	2020	0.002505
		12595	재하	M	126	2020	0.002505
		12596	태준	M	126	2020	0.002505
		12599	서율	M	122	2020	0.002426

name	가연	가영	가온	가윤	가은	가현	강민	건	건우	건호	...	현수	현아	현우	현준	현지	현진	현호	형준	혜원	혜인
year
2008	772.0	737.0	NaN	NaN	1571.0	728.0	NaN	532.0	1703.0	NaN	...	768.0	NaN	1924.0	1615.0	991.0	498.0	NaN	472.0	1007.0	NaN
2009	664.0	642.0	NaN	NaN	1703.0	677.0	519.0	489.0	1784.0	NaN	...	687.0	NaN	1909.0	1691.0	891.0	NaN	NaN	NaN	914.0	NaN
2010	652.0	598.0	NaN	554.0	1651.0	680.0	527.0	NaN	1945.0	531.0	...	701.0	576.0	2064.0	1989.0	822.0	NaN	494.0	NaN	1070.0	NaN
2011	NaN	NaN	NaN	734.0	1420.0	635.0	531.0	NaN	1746.0	NaN	...	583.0	586.0	2032.0	1785.0	687.0	NaN	NaN	NaN	966.0	646.0
2012	640.0	NaN	NaN	720.0	1333.0	659.0	493.0	NaN	2050.0	NaN	...	517.0	NaN	2055.0	1560.0	NaN	NaN	NaN	NaN	725.0	NaN
2013	NaN	NaN	NaN	782.0	1121.0	NaN	454.0	NaN	1577.0	NaN	...	468.0	NaN	1861.0	1426.0	NaN	NaN	NaN	NaN	660.0	NaN
2014	NaN	NaN	NaN	600.0	1044.0	NaN	NaN	485.0	1349.0	NaN	...	NaN	NaN	1784.0	1427.0	NaN	NaN	NaN	NaN	694.0	NaN
2015	NaN	NaN	1123.0	588.0	889.0	NaN	NaN	514.0	1479.0	NaN	...	NaN	NaN	1837.0	1294.0	NaN	NaN	NaN	NaN	561.0	NaN
2016	NaN	NaN	518.0	500.0	899.0	NaN	429.0	462.0	1524.0	NaN	...	NaN	NaN	1486.0	1000.0	NaN	NaN	NaN	NaN	NaN	NaN
2017	NaN	NaN	480.0	NaN	831.0	NaN	NaN	NaN	1297.0	NaN	...	NaN	NaN	1224.0	767.0	NaN	NaN	NaN	NaN	NaN	NaN
2018	NaN	NaN	NaN	NaN	594.0	NaN	NaN	NaN	1287.0	NaN	...	NaN	NaN	1068.0	702.0	NaN	NaN	NaN	NaN	NaN	NaN
2019	NaN	NaN	NaN	NaN	553.0	NaN	NaN	320.0	1404.0	NaN	...	NaN	NaN	903.0	596.0	NaN	NaN	NaN	NaN	NaN	NaN
2020	NaN	NaN	NaN	NaN	173.0	NaN	NaN	135.0	536.0	NaN	...	NaN	NaN	349.0	182.0	NaN	NaN	NaN	NaN	NaN	NaN

name	민준	하준	서연	지우
year
2008	2642.0	NaN	3280.0	2815.0
2009	3103.0	496.0	3514.0	3541.0
2010	3601.0	678.0	3518.0	3762.0
2011	4026.0	1160.0	3111.0	3933.0
2012	3691.0	1210.0	3250.0	3938.0
2013	2769.0	1454.0	3133.0	3749.0
2014	4137.0	2977.0	3334.0	3508.0
2015	3821.0	3053.0	3015.0	3783.0
2016	2959.0	2894.0	2551.0	3270.0
2017	2246.0	2723.0	2075.0	2734.0
2018	2022.0	2627.0	1784.0	2572.0
2019	1648.0	2309.0	1406.0	2320.0
2020	586.0	858.0	511.0	901.0

gender	F	M
year
2008	0.578999	0.527945
2009	0.579783	0.538014
2010	0.571665	0.539687
2011	0.570670	0.551487
2012	0.574947	0.552797
2013	0.582466	0.556796
2014	0.595898	0.576415
2015	0.593721	0.578102
2016	0.588982	0.575888
2017	0.593945	0.582100
2018	0.599030	0.593791
2019	0.599991	0.603443
2020	0.604493	0.611663

	name	gender	births	year	prop
500	민준	M	3601	2010	0.020302
501	지훈	M	2106	2010	0.011873
502	예준	M	2076	2010	0.011704
503	현우	M	2064	2010	0.011636
504	지호	M	2058	2010	0.011603
...	...	...	...	...	...
595	도영	M	482	2010	0.002717
596	승훈	M	481	2010	0.002712
597	재훈	M	479	2010	0.002700
598	상현	M	478	2010	0.002695
599	은우	M	473	2010	0.002667

last_letter	우	준	현
year
2008	0.098378	0.095745	0.088730
2009	0.106945	0.103008	0.077567
2010	0.112214	0.112383	0.072896
2011	0.113082	0.118197	0.073600
2012	0.120518	0.116748	0.074191

gender	F	M
year
2016	0.337038	0.662962
2017	0.303000	0.697000
2018	0.328807	0.671193
2019	0.288462	0.711538
2020	0.269565	0.730435

	name	gender	births
0	Mary	F	7065
1	Anna	F	2604
2	Emma	F	2003
3	Elizabeth	F	1939
4	Minnie	F	1746
...	...	...	...
1995	Woodie	M	5
1996	Worthy	M	5
1997	Wright	M	5
1998	York	M	5
1999	Zachariah	M	5

	name	gender	births	year
0	Mary	F	7065	1880
1	Anna	F	2604	1880
2	Emma	F	2003	1880
3	Elizabeth	F	1939	1880
4	Minnie	F	1746	1880
...	...	...	...	...
1690779	Zymaire	M	5	2010
1690780	Zyonne	M	5	2010
1690781	Zyquarius	M	5	2010
1690782	Zyran	M	5	2010
1690783	Zzyzx	M	5	2010

name	Aaden	Aaliyah	Aarav	Aaron	Aarush	Ab	Abagail	Abb	Abbey	Abbie	...	Zoa	Zoe	Zoey	Zoie	Zola	Zollie	Zona	Zora	Zula	Zuri
year
1880	NaN	NaN	NaN	102.0	NaN	NaN	NaN	NaN	NaN	71.0	...	8.0	23.0	NaN	NaN	7.0	NaN	8.0	28.0	27.0	NaN
1881	NaN	NaN	NaN	94.0	NaN	NaN	NaN	NaN	NaN	81.0	...	NaN	22.0	NaN	NaN	10.0	NaN	9.0	21.0	27.0	NaN
1882	NaN	NaN	NaN	85.0	NaN	NaN	NaN	NaN	NaN	80.0	...	8.0	25.0	NaN	NaN	9.0	NaN	17.0	32.0	21.0	NaN
1883	NaN	NaN	NaN	105.0	NaN	NaN	NaN	NaN	NaN	79.0	...	NaN	23.0	NaN	NaN	10.0	NaN	11.0	35.0	25.0	NaN
1884	NaN	NaN	NaN	97.0	NaN	NaN	NaN	NaN	NaN	98.0	...	13.0	31.0	NaN	NaN	14.0	6.0	8.0	58.0	27.0	NaN

	name	gender	births	year	prop
260877	Jacob	M	21875	2010	0.011523
260878	Ethan	M	17866	2010	0.009411
260879	Michael	M	17133	2010	0.009025
260880	Jayden	M	17030	2010	0.008971
260881	William	M	16870	2010	0.008887
...	...	...	...	...	...
261872	Camilo	M	194	2010	0.000102
261873	Destin	M	194	2010	0.000102
261874	Jaquan	M	194	2010	0.000102
261875	Jaydan	M	194	2010	0.000102
261876	Maxton	M	193	2010	0.000102

gender	F			M
year	1910	1960	2010	1910	1960	2010
last_letter
a	108376.0	691247.0	670605.0	977.0	5204.0	28438.0
b	NaN	694.0	450.0	411.0	3912.0	38859.0
c	5.0	49.0	946.0	482.0	15476.0	23125.0
d	6750.0	3729.0	2607.0	22111.0	262112.0	44398.0
e	133569.0	435013.0	313833.0	28655.0	178823.0	129012.0

gender	F			M
year	1910	1960	2010	1910	1960	2010
last_letter
a	0.273390	0.341853	0.381240	0.005031	0.002440	0.014980
b	NaN	0.000343	0.000256	0.002116	0.001834	0.020470
c	0.000013	0.000024	0.000538	0.002482	0.007257	0.012181
d	0.017028	0.001844	0.001482	0.113858	0.122908	0.023387
e	0.336941	0.215133	0.178415	0.147556	0.083853	0.067959
f	NaN	0.000010	0.000055	0.000783	0.004325	0.001188
g	0.000144	0.000157	0.000374	0.002250	0.009488	0.001404
h	0.051529	0.036224	0.075852	0.045562	0.037907	0.051670
i	0.001526	0.039965	0.031734	0.000844	0.000603	0.022628
j	NaN	NaN	0.000090	NaN	NaN	0.000769
k	0.000121	0.000156	0.000356	0.036581	0.049384	0.018541
l	0.043189	0.033867	0.026356	0.065016	0.104904	0.070367
m	0.001201	0.008613	0.002588	0.058044	0.033827	0.024657
n	0.079240	0.130687	0.140210	0.143415	0.152522	0.362771
o	0.001660	0.002439	0.001243	0.017065	0.012829	0.042681
p	0.000018	0.000023	0.000020	0.003172	0.005675	0.001269
q	NaN	NaN	0.000030	NaN	NaN	0.000180
r	0.013390	0.006764	0.018025	0.064481	0.031034	0.087477
s	0.039042	0.012764	0.013332	0.130815	0.102730	0.065145
t	0.027438	0.015201	0.007830	0.072879	0.065655	0.022861
u	0.000684	0.000574	0.000417	0.000124	0.000057	0.001221
v	NaN	0.000060	0.000117	0.000113	0.000037	0.001434
w	0.000020	0.000031	0.001182	0.006329	0.007711	0.016148
x	0.000015	0.000037	0.000727	0.003965	0.001851	0.008614
y	0.110972	0.152569	0.116828	0.077349	0.160987	0.058168
z	0.002439	0.000659	0.000704	0.000170	0.000184	0.001831

last_letter	d	n	y
year
1880	0.083055	0.153213	0.075760
1881	0.083247	0.153214	0.077451
1882	0.085340	0.149560	0.077537
1883	0.084066	0.151646	0.079144
1884	0.086120	0.149915	0.080405

	country	beer_servings	spirit_servings	wine_servings	total_litres_of_pure_alcohol	continent
0	Afghanistan	0	0	0	0.0	AS
1	Albania	89	132	54	4.9	EU
2	Algeria	25	0	14	0.7	AF
3	Andorra	245	138	312	12.4	EU
4	Angola	217	57	45	5.9	AF
5	Antigua & Barbuda	102	128	45	4.9	NaN
6	Argentina	193	25	221	8.3	SA
7	Armenia	21	179	11	3.8	EU
8	Australia	261	72	212	10.4	OC
9	Austria	279	75	191	9.7	EU

	beer_servings	spirit_servings	wine_servings	total_litres_of_pure_alcohol
count	193.000000	193.000000	193.000000	193.000000
mean	106.160622	80.994819	49.450777	4.717098
std	101.143103	88.284312	79.697598	3.773298
min	0.000000	0.000000	0.000000	0.000000
25%	20.000000	4.000000	1.000000	1.300000
50%	76.000000	56.000000	8.000000	4.200000
75%	188.000000	128.000000	59.000000	7.200000
max	376.000000	438.000000	370.000000	14.400000

	mean	min	max	sum
continent
AF	16.339623	0	152	866
AS	60.840909	0	326	2677
EU	132.555556	0	373	5965
OC	58.437500	0	254	935
OT	165.739130	68	438	3812
SA	114.750000	25	302	1377

	count	mean	min	max	sum	var	std
continent
AF	53	16.339623	0	152	866	789.767054	28.102794
AS	44	60.840909	0	326	2677	7116.974101	84.362160
EU	45	132.555556	0	373	5965	6020.070707	77.589115
OC	16	58.437500	0	254	935	4970.929167	70.504817
OT	23	165.739130	68	438	3812	9023.837945	94.993884
SA	12	114.750000	25	302	1377	5940.931818	77.077440

	count	mean	min	max	sum	var	std
continent
AF	53	61.471698	0	376	3258	6489.561684	80.557816
AS	44	37.045455	0	247	1630	2447.253700	49.469725
EU	45	193.777778	0	361	8720	9926.449495	99.631569
OC	16	89.687500	0	306	1435	9339.562500	96.641412
OT	23	145.434783	1	285	3345	6339.529644	79.621163
SA	12	175.083333	93	333	2101	4256.628788	65.242845

	count	mean	min	max	sum	var	std
continent
AF	53	16.264151	0	233	862	1509.044267	38.846419
AS	44	9.068182	0	123	399	469.460359	21.667034
EU	45	142.222222	0	370	6400	9490.994949	97.421738
OC	16	35.625000	0	212	570	4167.450000	64.555790
OT	23	24.521739	1	100	564	798.988142	28.266378
SA	12	62.416667	1	221	749	7853.537879	88.620189

	count	mean	min	max	sum	var	std
continent
AF	53	3.007547	0.0	9.1	159.4	7.009557	2.647557
AS	44	2.170455	0.0	11.5	95.5	7.674223	2.770239
EU	45	8.617778	0.0	14.4	387.8	11.279222	3.358455
OC	16	3.381250	0.0	10.4	54.1	11.193625	3.345688
OT	23	5.995652	2.2	11.9	137.9	5.804980	2.409353
SA	12	6.308333	3.8	8.3	75.7	2.344470	1.531166

	country	beer_servings	spirit_servings	wine_servings	total_litres_of_pure_alcohol	continent
2	Algeria	25	0	14	0.7	AF
4	Angola	217	57	45	5.9	AF
18	Benin	34	4	13	1.1	AF
22	Botswana	173	35	35	5.4	AF
26	Burkina Faso	25	7	7	4.3	AF

	country	alcohol_rate
63	Gambia	0.266667
153	Sierra Leone	0.223333
124	Nigeria	0.185714
179	Uganda	0.153704
142	Rwanda	0.151111

	구분	총계	총계 남	총계 여	중구 계	중구 남	중구 여	동구 계	동구 남	동구 여	...	북구 여	수성구 계	수성구 남	수성구 여	달서구 계	달서구 남	달서구 여	달성군 계	달성군 남	달성군 여
0	0세	14568	7488	7080	511	275	236	2263	1161	1102	...	1356	1784	911	873	3215	1719	1496	2724	1374	1350
1	1세	17475	8947	8528	609	309	300	2755	1410	1345	...	1670	2264	1170	1094	3818	2017	1801	3063	1543	1520
2	2세	19562	9985	9577	612	308	304	3000	1467	1533	...	1847	2684	1410	1274	4423	2317	2106	3327	1668	1659
3	3세	20123	10249	9874	648	315	333	3146	1656	1490	...	1856	2876	1423	1453	4661	2366	2295	3253	1638	1615
4	4세	19794	10069	9725	556	287	269	2971	1519	1452	...	1846	3089	1596	1493	4529	2294	2235	3036	1512	1524
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
96	96세	365	75	290	23	3	20	66	10	56	...	37	73	15	58	64	12	52	31	9	22
97	97세	235	45	190	18	3	15	35	13	22	...	36	43	7	36	39	6	33	16	3	13
98	98세	190	32	158	16	7	9	33	6	27	...	30	29	4	25	25	6	19	15	0	15
99	99세	112	26	86	6	1	5	25	7	18	...	13	17	4	13	19	2	17	8	2	6
100	100세 이상	614	136	478	88	15	73	106	23	83	...	52	97	24	73	68	13	55	26	8	18

	중구 계	동구 계	서구 계	남구 계	북구 계	수성구 계	달서구 계	달성군 계
0	511	2263	719	586	2766	1784	3215	2724
1	609	2755	887	718	3361	2264	3818	3063
2	612	3000	1010	776	3730	2684	4423	3327
3	648	3146	949	758	3832	2876	4661	3253
4	556	2971	1014	820	3779	3089	4529	3036
...	...	...	...	...	...	...	...	...
96	23	66	30	29	49	73	64	31
97	18	35	18	24	42	43	39	16
98	16	33	15	23	34	29	25	15
99	6	25	10	11	16	17	19	8
100	88	106	89	72	68	97	68	26

Python

1. 넘파이를 쓰는 이유

2. 간단한 기능

3. ndarray

4. linspace와 arange 차이점

5. Normal Distributions

'Python > Numpy' 카테고리의 다른 글

Matplotlib

다양한 막대그래프

'Python' 카테고리의 다른 글

'Python' 카테고리의 다른 글

연도별/성별에 따른 선호하는 이름 100개 추출¶

상위 100개의 이름데이터를 남자(boys)와 여자(girls)로 분리¶

'Python' 카테고리의 다른 글

연도별 / 성별에 따른 선호하는 이름 1000개 추출¶

상위 1000개의 이름데이터를 남자(boys)와 여자(girls)로 분리¶

마지막 글자의 변화¶

'Python' 카테고리의 다른 글

read_csv로 데이터를 Dataframe형태로 불러옴¶

correlation : 각 숫자형 변수들 간의 상관관계를 나타낸 계수¶

scatterplot(산점도) : 탐색적 분석에서 필수적으로 사용하며 각 점들을 찍은 그래프(역시 숫자형 변수들만)¶

결측 데이터 전처리¶

시각화¶

total_litres_of_pure_alcohol 평균을 구하기¶

이 평균값보다 적은 알코올을 섭취하는 대륙중에서 spirit을 가장 많이 마시는 국가를 구해보자¶

1)¶

2)¶

total_litres_of_pure_alcohol 평균을 구하기¶

이 평균값보다 적은 알코올을 섭취하는 대륙중에서 spirit을 가장 많이 마시는 국가를 구해보자¶

A[A.column_name.isin(B)]¶

'Python' 카테고리의 다른 글

2018.06.30.기준 대구광역시 주민등록인구 현황¶

2018.12.31.기준 대구광역시 주민등록인구 현황¶

2019.06.30.기준 대구광역시 주민등록인구 현황¶

3개의 기간동안 인구가 증가하는 곳을 찾는다¶

4. 다음조건 찾기¶

컬럼 이름 변경¶

5. 대구광역시의 각 구별 인구수를 정리하여 표시하고 인구가 가장 많은 지역과 가장 적은 지역을 구하여 출력 하라(3개 파일에서 각각 구하라)¶

6. 대구광역시의 각 구별 20대 대비 60세 이상의 인구 비율을 구하여 출력하고 이 비율이 가장 높은 지역을 구하여 출력하라¶

'Python' 카테고리의 다른 글

엑셀파일읽기¶

엑셀파일저장¶

누락된 데이터 처리¶

결측치 채우기¶

df.fillna(value=, method='ffill', axis=0, inplace=, limit=)¶

데이터 변형하기¶

onehot 인코딩¶

'Python' 카테고리의 다른 글

티스토리툴바

	중구 계	동구 계	서구 계	남구 계	북구 계	수성구 계	달서구 계	달성군 계
인구0
0	5674	29075	9719	7797	37401	32508	46276	29751
10	5549	29903	14006	11162	48985	57880	63987	23382
20	11300	43608	25542	20608	61021	58097	82987	29853
30	11396	48971	21529	18912	57670	46528	72962	43404
40	11358	54564	27671	22502	77961	79395	98641	40726
50	12107	59064	38473	26615	75014	74095	104487	38816
60	10243	46790	29106	22642	45549	46530	61842	23984
70	7385	27835	16212	14612	25635	26830	30320	12057
80	2997	10725	5702	6239	9784	12104	12259	5235
90	433	1251	656	729	1054	1404	1291	636
100	88	106	89	72	68	97	68	26

	중구 계	동구 계	서구 계	남구 계	북구 계	수성구 계	달서구 계	달성군 계
인구
0	5674	29075	9719	7797	37401	32508	46276	29751
10	5549	29903	14006	11162	48985	57880	63987	23382
20	11300	43608	25542	20608	61021	58097	82987	29853
30	11396	48971	21529	18912	57670	46528	72962	43404
40	11358	54564	27671	22502	77961	79395	98641	40726
50	12107	59064	38473	26615	75014	74095	104487	38816
60	10243	46790	29106	22642	45549	46530	61842	23984
70	10903	39917	22659	21652	36541	40435	43938	17954

	구분	총계	총계 남	총계 여	중구 계	중구 남	중구 여	동구 계	동구 남	동구 여	...	북구 여	수성구 계	수성구 남	수성구 여	달서구 계	달서구 남	달서구 여	달성군 계	달성군 남	달성군 여
0	0세	13993	7075	6918	521	272	249	2173	1100	1073	...	1318	1685	874	811	2954	1483	1471	2735	1362	1373
1	1세	16306	8442	7864	548	278	270	2569	1365	1204	...	1550	2097	1060	1037	3627	1959	1668	2968	1521	1447
2	2세	18774	9576	9198	656	325	331	2875	1433	1442	...	1791	2484	1313	1171	4160	2185	1975	3282	1624	1658
3	3세	20010	10252	9758	639	324	315	3055	1592	1463	...	1863	2858	1477	1381	4623	2363	2260	3264	1629	1635
4	4세	20020	10219	9801	594	310	284	3055	1541	1514	...	1838	3106	1592	1514	4571	2334	2237	3208	1622	1586

	중구 계	동구 계	서구 계	남구 계	북구 계	수성구 계	달서구 계	달성군 계
인구
0	5744	28571	9287	7520	36774	31851	45262	29944
10	5583	29313	13294	10849	48006	56785	62415	23427
20	11670	43532	24698	20423	60735	57599	82364	29824
30	11653	48399	20686	18483	57192	45823	72346	43396
40	11368	53666	26316	21908	76298	77586	96358	40770
50	12107	59146	37819	26357	75980	74581	105563	39590
60	10215	47617	29113	22849	46874	47295	63772	25015
70	11061	41047	23159	22112	37630	41239	45333	18577

	행정구역	총계	총계 남	총계 여	중구 계	중구 남	중구 여	동구 계	동구 남	동구 여	...	북구 여	수성구 계	수성구 남	수성구 여	달서구 계	달서구 남	달서구 여	달성군 계	달성군 남	달성군 여
0	0세	13085	6754	6331	456	228	228	2019	1021	998	...	1221	1565	839	726	2771	1413	1358	2660	1358	1302
1	1세	15384	7854	7530	524	273	251	2401	1235	1166	...	1434	1964	986	978	3370	1780	1590	2885	1429	1456
2	2세	17580	9026	8554	588	293	295	2718	1429	1289	...	1682	2387	1217	1170	3836	2028	1808	3138	1579	1559
3	3세	19659	10028	9631	625	325	300	2947	1442	1505	...	1848	2839	1501	1338	4480	2313	2167	3362	1701	1661
4	4세	20230	10294	9936	628	297	331	3103	1620	1483	...	1858	3069	1536	1533	4710	2388	2322	3260	1641	1619

	중구 계	동구 계	서구 계	남구 계	북구 계	수성구 계	달서구 계	달성군 계
인구
0	5572	27937	8765	7221	36116	31376	44236	30293
10	5370	28629	12665	10462	46834	55090	60556	23621
20	11620	42833	23711	20327	60691	56987	81862	29765
30	11311	47415	19635	17871	56384	45226	71171	43412
40	11108	52960	25199	21400	75095	76047	94565	41490
50	11798	58640	36882	26010	76729	74962	105716	40356
60	10069	48272	29378	22886	48297	48484	66056	26081
70	11087	42139	23685	22505	38947	42309	47141	19221

	중구 계	중구 남	중구 여	동구 계	동구 남	동구 여	서구 계	서구 남	서구 여	남구 계	...	북구 여	수성구 계	수성구 남	수성구 여	달서구 계	달서구 남	달서구 여	달성군 계	달성군 남	달성군 여
인구
0	5674	2952	2722	29075	14987	14088	9719	4936	4783	7797	...	18260	32508	16713	15795	46276	23865	22411	29751	15132	14619
10	5549	2801	2748	29903	15815	14088	14006	7406	6600	11162	...	23284	57880	30792	27088	63987	33506	30481	23382	12302	11080
20	11300	5611	5689	43608	23707	19901	25542	14240	11302	20608	...	27737	58097	31464	26633	82987	45010	37977	29853	16533	13320
30	11396	5877	5519	48971	25221	23750	21529	11920	9609	18912	...	28127	46528	22290	24238	72962	37075	35887	43404	22386	21018
40	11358	5868	5490	54564	28261	26303	27671	14669	13002	22502	...	39635	79395	36264	43131	98641	47547	51094	40726	21246	19480
50	12107	6000	6107	59064	28461	30603	38473	19174	19299	26615	...	37602	74095	36270	37825	104487	50800	53687	38816	19554	19262
60	10243	4642	5601	46790	21842	24948	29106	13749	15357	22642	...	23545	46530	21938	24592	61842	30114	31728	23984	11985	11999
70	10903	4343	6560	39917	16048	23869	22659	9023	13636	21652	...	22217	40435	16218	24217	43938	17326	26612	17954	7004	10950
총합	78530	38094	40436	351892	174342	177550	188705	95117	93588	151890	...	220407	435468	211949	223519	575120	285243	289877	247870	126142	121728

	중구 계	중구 남	중구 여	동구 계	동구 남	동구 여	서구 계	서구 남	서구 여	남구 계	...	북구 여	수성구 계	수성구 남	수성구 여	달서구 계	달서구 남	달서구 여	달성군 계	달성군 남	달성군 여
인구
0	5744	2978	2766	28571	14765	13806	9287	4707	4580	7520	...	17984	31851	16415	15436	45262	23290	21972	29944	15194	14750
10	5583	2815	2768	29313	15439	13874	13294	7028	6266	10849	...	22866	56785	30245	26540	62415	32564	29851	23427	12319	11108
20	11670	5764	5906	43532	23636	19896	24698	13701	10997	20423	...	27621	57599	31132	26467	82364	44525	37839	29824	16481	13343
30	11653	6028	5625	48399	25034	23365	20686	11520	9166	18483	...	27786	45823	21936	23887	72346	36903	35443	43396	22413	20983
40	11368	5873	5495	53666	27768	25898	26316	14011	12305	21908	...	38825	77586	35471	42115	96358	46516	49842	40770	21276	19494
50	12107	5988	6119	59146	28535	30611	37819	18840	18979	26357	...	38142	74581	36489	38092	105563	51225	54338	39590	19897	19693
60	10215	4679	5536	47617	22270	25347	29113	13742	15371	22849	...	24122	47295	22356	24939	63772	31093	32679	25015	12497	12518
70	11061	4373	6688	41047	16508	24539	23159	9247	13912	22112	...	22836	41239	16577	24662	45333	18011	27322	18577	7297	11280
총합	79401	38498	40903	351291	173955	177336	184372	92796	91576	150501	...	220182	432759	210621	222138	573413	284127	289286	250543	127374	123169

	중구 계	중구 남	중구 여	동구 계	동구 남	동구 여	서구 계	서구 남	서구 여	남구 계	...	북구 여	수성구 계	수성구 남	수성구 여	달서구 계	달서구 남	달서구 여	달성군 계	달성군 남	달성군 여
인구
0	5572	2855	2717	27937	14401	13536	8765	4443	4322	7221	...	17605	31376	16144	15232	44236	22726	21510	30293	15404	14889
10	5370	2705	2665	28629	15069	13560	12665	6683	5982	10462	...	22372	55090	29299	25791	60556	31529	29027	23621	12396	11225
20	11620	5714	5906	42833	23142	19691	23711	13079	10632	20327	...	27671	56987	30778	26209	81862	44109	37753	29765	16427	13338
30	11311	5836	5475	47415	24603	22812	19635	10977	8658	17871	...	27225	45226	21812	23414	71171	36515	34656	43412	22487	20925
40	11108	5695	5413	52960	27422	25538	25199	13470	11729	21400	...	38228	76047	34718	41329	94565	45710	48855	41490	21640	19850
50	11798	5825	5973	58640	28290	30350	36882	18430	18452	26010	...	38585	74962	36596	38366	105716	51170	54546	40356	20164	20192
60	10069	4649	5420	48272	22488	25784	29378	13856	15522	22886	...	24784	48484	22900	25584	66056	32130	33926	26081	13063	13018
70	11087	4382	6705	42139	16986	25153	23685	9481	14204	22505	...	23595	42309	17001	25308	47141	18867	28274	19221	7652	11569
총합	77935	37661	40274	348825	172401	176424	179920	90419	89501	148682	...	220065	430481	209248	221233	571303	282756	288547	254239	129233	125006

	pclass	survived	name	sex	age	sibsp	parch	ticket	fare	cabin	embarked	body	home.dest
0	2	1	Mellinger, Miss. Madeleine Violet	female	13.0	0	1	250644	19.5000	NaN	S	NaN	England / Bennington, VT
1	2	1	Wells, Miss. Joan	female	4.0	1	1	29103	23.0000	NaN	S	NaN	Cornwall / Akron, OH
2	2	1	Duran y More, Miss. Florentina	female	30.0	1	0	SC/PARIS 2148	13.8583	NaN	C	NaN	Barcelona, Spain / Havana, Cuba
3	3	0	Scanlan, Mr. James	male	NaN	0	0	36209	7.7250	NaN	Q	NaN	NaN
4	3	1	Bradley, Miss. Bridget Delia	female	22.0	0	0	334914	7.7250	NaN	Q	NaN	Kingwilliamstown, Co Cork, Ireland Glens Falls...