import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt

read_csv로 데이터를 Dataframe형태로 불러옴¶

file_path='data/drinks.csv'
drinks=pd.read_csv(file_path)

info()함수를 통해 결측(NaN)값을 확인할 수 있음

print(drinks.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 193 entries, 0 to 192
Data columns (total 6 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   country                       193 non-null    object 
 1   beer_servings                 193 non-null    int64  
 2   spirit_servings               193 non-null    int64  
 3   wine_servings                 193 non-null    int64  
 4   total_litres_of_pure_alcohol  193 non-null    float64
 5   continent                     170 non-null    object 
dtypes: float64(1), int64(3), object(2)
memory usage: 9.2+ KB
None

drinks.head(10) #디폴트 값은 5

drinks.describe() # 숫자만 describe로 나옴

corr=drinks[['beer_servings', 'wine_servings']].corr(method='pearson') #두 변수 X 와 Y 간의 선형 상관 관계를 계량화한 수치다
print(corr) # beer가 1일

beer_servings  wine_servings
beer_servings       1.000000       0.527172
wine_servings       0.527172       1.000000

cols=['beer_servings','spirit_servings','wine_servings','total_litres_of_pure_alcohol']
corr= drinks[cols].corr(method='pearson')
print(corr)

beer_servings  spirit_servings  wine_servings  \
beer_servings                      1.000000         0.458819       0.527172   
spirit_servings                    0.458819         1.000000       0.194797   
wine_servings                      0.527172         0.194797       1.000000   
total_litres_of_pure_alcohol       0.835839         0.654968       0.667598   

                              total_litres_of_pure_alcohol  
beer_servings                                     0.835839  
spirit_servings                                   0.654968  
wine_servings                                     0.667598  
total_litres_of_pure_alcohol                      1.000000

corr.values

array([[1.        , 0.45881887, 0.52717169, 0.83583863],
       [0.45881887, 1.        , 0.19479705, 0.65496818],
       [0.52717169, 0.19479705, 1.        , 0.66759834],
       [0.83583863, 0.65496818, 0.66759834, 1.        ]])

heatmap[heat(열)+map(지도)]: 데이터들의 배열을 색상으로 표현해주는 그래프

heatmap을 사용하면 두 개의 카테고리 값에 대한 값 변화를 한눈에 알기 쉬움
대용량 데이터도 쉽게 표현 가능

import seaborn as sns
# corr 행렬 히트맵을 시각화합니다.
cols_view = ['beer', 'spirit', 'wine', 'alcohol'] # 그래프 출력을 위한 cols 이름을 축약합니다.
sns.set(font_scale=1.5)
hm = sns.heatmap(corr.values,
            cbar=True, #컬러바 유무
            annot=True, # 각 셀에 숫자 입력 
            fmt='.2f', #둘째자리까지
            square=True,
            annot_kws={'size': 15}, #숫자 크기 조절 
            yticklabels=cols_view,
            xticklabels=cols_view)

plt.tight_layout() # 여백
plt.show()

# 시각화 라이브러리를 이용한 피처간의 scatter plot을 출력합니다.
sns.set(style='whitegrid', context='notebook')
sns.pairplot(drinks[['beer_servings', 'spirit_servings', 
                     'wine_servings', 'total_litres_of_pure_alcohol']], height=2.5)
plt.show()
# 점 하나는 나라 하나

correlation : 각 숫자형 변수들 간의 상관관계를 나타낸 계수¶

1~0.7는 강한 양의 상관관계 ex) y = x
0.7~-1은 강한 음의 상관관계 ex) y= -x
(0에 가까울 수록 두 변수들 간의 상관이 없다)

scatterplot(산점도) : 탐색적 분석에서 필수적으로 사용하며 각 점들을 찍은 그래프(역시 숫자형 변수들만)¶

그래프를 이용하여 각 변수들 간의 상관관계나 모양 같은 것을 봄
y = x 와 같은 상관관계를 쉽게 볼 수 있고, 다른 관계도 추측할 수 있다.

결측 데이터 전처리¶

print(drinks.isnull().sum())
print("----------------------------------------")
print(drinks.dtypes)

country                          0
beer_servings                    0
spirit_servings                  0
wine_servings                    0
total_litres_of_pure_alcohol     0
continent                       23
dtype: int64
----------------------------------------
country                          object
beer_servings                     int64
spirit_servings                   int64
wine_servings                     int64
total_litres_of_pure_alcohol    float64
continent                        object
dtype: object

결측 데이터 채우기

drinks['continent']=drinks['continent'].fillna('OT') #OT=OTHER

print(drinks.isnull().sum())

country                         0
beer_servings                   0
spirit_servings                 0
wine_servings                   0
total_litres_of_pure_alcohol    0
continent                       0
dtype: int64

print(drinks.info()) # 5번 193으로 달라짐

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 193 entries, 0 to 192
Data columns (total 6 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   country                       193 non-null    object 
 1   beer_servings                 193 non-null    int64  
 2   spirit_servings               193 non-null    int64  
 3   wine_servings                 193 non-null    int64  
 4   total_litres_of_pure_alcohol  193 non-null    float64
 5   continent                     193 non-null    object 
dtypes: float64(1), int64(3), object(2)
memory usage: 9.2+ KB
None

labels= drinks['continent'].value_counts().index.tolist() #index를 리스트로 만들고
fracs1=drinks['continent'].value_counts().values.tolist() #continent의 valuses를 리스트로 
explode= (0,0,0,0.25,0,0)
plt.pie(fracs1,explode=explode, labels=labels, autopct='%.0f%%', shadow=True) #shadow=True 그림자 붙이기 
plt.title("NULL Data to 'OT'")
plt.show()

labels= drinks['continent'].value_counts().index.tolist()
print(labels)

['AF', 'EU', 'AS', 'OT', 'OC', 'SA']

fracs1=drinks['continent'].value_counts().values.tolist()
print(fracs1)

[53, 45, 44, 23, 16, 12]

#plt.rcParams.update({'font.size': 15}) #(1)
plt.figure(figsize=(5,5)) #그림크기
#plt.rc('font',size=15) #폰트크기 (2)
explode= (0,0,0,0.2,0,0)
plt.pie(fracs1, #데이터값,values, 
explode=explode, #파이에서 튀어나오는 위치 
labels=labels, 
colors=None, 
autopct='%.1f%%',
pctdistance=0.6, 
shadow=False,
labeldistance=1.1, 
startangle=230, # 90넣으면 반시계방향으로 90도 돔
radius=None, 
counterclock=True, #반시계방향, False하면 위치바뀜 
wedgeprops=None, 
textprops=None, 
center=(0, 0), 
frame=False, 
rotatelabels=False)
plt.title("NULL Data to 'OT'")
plt.show()
#plt.rcParams.update({'font.size': 15}) (1)
plt.rc('font',size=10) #폰트 크기를 원래대로 지정 (2)

https://matplotlib.org/3.1.1/api/_as_gen/matplotlib.pyplot.pie.html

result=drinks.groupby('continent').spirit_servings.agg(['mean','min','max','sum'])
result.head(10)

result=drinks.groupby('continent').spirit_servings. \
        agg(['count','mean','min','max','sum','var','std']) #agg함수: 여러 개 적용할 때 사용/ var:분산, std:표준편차
result.head(10)

result=drinks.groupby('continent').beer_servings. \
        agg(['count','mean','min','max','sum','var','std']) #agg함수: 여러 개 적용할 때 사용/ var:분산, std:표준편차
result.head(10)

result=drinks.groupby('continent').wine_servings. \
        agg(['count','mean','min','max','sum','var','std']) #agg함수: 여러 개 적용할 때 사용/ var:분산, std:표준편차
result.head(10)

result=drinks.groupby('continent').total_litres_of_pure_alcohol. \
        agg(['count','mean','min','max','sum','var','std']) #agg함수: 여러 개 적용할 때 사용/ var:분산, std:표준편차
result.head(10)

total_litres_of_pure_alcohol의 전체평균

total_mean=drinks.total_litres_of_pure_alcohol.mean()
total_mean

4.717098445595855

continent_mean=drinks.groupby('continent')['total_litres_of_pure_alcohol'].mean()
print(continent_mean)

continent
AF    3.007547
AS    2.170455
EU    8.617778
OC    3.381250
OT    5.995652
SA    6.308333
Name: total_litres_of_pure_alcohol, dtype: float64

continent_over_mean=continent_mean[continent_mean >=total_mean]
print(continent_over_mean)

continent
EU    8.617778
OT    5.995652
SA    6.308333
Name: total_litres_of_pure_alcohol, dtype: float64

평균 beer_servings가 가장 높은 대륙

beer_continent= drinks.groupby('continent').beer_servings.mean()
beer_continent

continent
AF     61.471698
AS     37.045455
EU    193.777778
OC     89.687500
OT    145.434783
SA    175.083333
Name: beer_servings, dtype: float64

beer_continent_max=beer_continent[beer_continent==beer_continent.max()]
beer_continent_max

continent
EU    193.777778
Name: beer_servings, dtype: float64

beer_continent.idxmax()

'EU'

beer_continent[beer_continent.idxmax()]

193.77777777777777

평균 beer_servings가 가장 낮은 대륙

beer_continent_min=beer_continent[beer_continent==beer_continent.min()]
beer_continent_min

continent
AS    37.045455
Name: beer_servings, dtype: float64

beer_continent.idxmin()

'AS'

beer_continent[beer_continent.idxmin()]

37.04545454545455

시각화¶

대륙별 spirit_servings의 평균,최소,최대를 시각화

result=drinks.groupby('continent').spirit_servings. \
        agg(['mean','min','max','count'])
result.head(10)

number_groups=len(result.index) 
number_groups

6

means= result['mean'].tolist()
means

[16.339622641509433,
 60.84090909090909,
 132.55555555555554,
 58.4375,
 165.7391304347826,
 114.75]

mins= result['min'].tolist()
maxs= result['max'].tolist()
counts= result['count'].tolist() 
index=np.arange(number_groups)

plt.figure(figsize=(10,5)) #그림크기
bar_width= 0.2 #0.8 디폴트 

rects_1= plt.bar(index, means, bar_width, color='r', label='Mean')
rects_2= plt.bar(index+bar_width, mins, bar_width, color='g', label='Min')
rects_3= plt.bar(index+bar_width*2, maxs, bar_width, color='b', label='Max')
rects_4= plt.bar(index+bar_width*3, counts, bar_width, color='y', label='Count')

plt.xticks(index, result.index.tolist())
plt.legend()
plt.show()

https://matplotlib.org/3.1.1/api/_as_gen/matplotlib.pyplot.bar.html?highlight=bar#matplotlib.pyplot.bar

대륙별 total_litres_of_pure_alcohol 시각화

continent_mean=drinks.groupby('continent')['total_litres_of_pure_alcohol'].mean()
continent_mean

continent
AF    3.007547
AS    2.170455
EU    8.617778
OC    3.381250
OT    5.995652
SA    6.308333
Name: total_litres_of_pure_alcohol, dtype: float64

continents= continent_mean.index.tolist()
continents

['AF', 'AS', 'EU', 'OC', 'OT', 'SA']

continents.append('MEAN')
continents

['AF', 'AS', 'EU', 'OC', 'OT', 'SA', 'MEAN']

x_pos=np.arange(len(continents)) #그래프 세로
x_pos

array([0, 1, 2, 3, 4, 5, 6])

alcohol= continent_mean.tolist()
alcohol.append(total_mean)
alcohol

[3.00754716981132,
 2.1704545454545454,
 8.617777777777777,
 3.3812500000000005,
 5.995652173913044,
 6.308333333333334,
 4.717098445595855]

total_mean

4.717098445595855

#plt.grid(b=True, which='both', axis='both')
bar_list = plt.bar(x_pos, alcohol, align='center', alpha=0.5) 
bar_list[len(continents) - 1].set_color('r') # 맨 마지막을 빨간색으로 
plt.plot([0., 6], [total_mean, total_mean], "k--") #점선 그리기 
plt.xticks(x_pos, continents)

plt.ylabel('total_litres_of_pure_alcohol')
plt.title('total_litres_of_pure_alcohol by Continent')

plt.show()

#plt.grid(b=True, which='both', axis='both')
bar_list = plt.bar(x_pos, alcohol, align='center', alpha=0.5) 
bar_list[len(continents) - 1].set_color('g') 
plt.plot([0., 6], [total_mean, total_mean], "k--") #점선 그리기 
plt.xticks(x_pos, continents)

plt.ylabel('total_litres_of_pure_alcohol')
plt.title('total_litres_of_pure_alcohol by Continent')

plt.show()

continent_mean

continent
AF    3.007547
AS    2.170455
EU    8.617778
OC    3.381250
OT    5.995652
SA    6.308333
Name: total_litres_of_pure_alcohol, dtype: float64

#plt.grid(b=True, which='both', axis='both')
bar_list = plt.bar(x_pos, alcohol, align='center', alpha=0.5) 
bar_list[len(continents) - 1].set_color('g') 
plt.plot([3., 5], [continent_mean['SA'], continent_mean['SA']], "k--") #점선 그리기 
plt.xticks(x_pos, continents)

plt.ylabel('total_litres_of_pure_alcohol')
plt.title('total_litres_of_pure_alcohol by Continent')

plt.show()

https://matplotlib.org/3.3.1/api/_as_gen/matplotlib.pyplot.plot.html

!pip install scipy

Requirement already satisfied: scipy in c:\users\w\anaconda3\lib\site-packages (1.5.0)
Requirement already satisfied: numpy>=1.14.5 in c:\users\w\anaconda3\lib\site-packages (from scipy) (1.18.5)

africa=drinks.loc[drinks['continent']=='AF']
africa.head()

europe=drinks.loc[drinks['continent']=='EU']
europe.head()

drinks.head()

drinks['total_servings']=drinks['beer_servings']+ drinks['wine_servings']+ drinks['spirit_servings']

drinks['alcohol_rate']= drinks['total_litres_of_pure_alcohol'] / drinks['total_servings']

drinks['alcohol_rate']= drinks['alcohol_rate'].fillna(0)

drinks.head()

country_rank= drinks[['country','alcohol_rate']]
country_rank= country_rank.sort_values(by=['alcohol_rate'], ascending= 0)
country_rank.head()

country_list = country_rank.country.tolist()
x_pos = np.arange(len(country_list))
rank = country_rank.alcohol_rate.tolist()
 
bar_list = plt.bar(x_pos, rank)
bar_list[country_list.index("South Korea")].set_color('r')
plt.ylabel('alcohol rate')
plt.title('liquor drink rank by contry')
plt.axis([0, 200, 0, 0.3])

korea_rank = country_list.index("South Korea")
korea_alc_rate = country_rank[country_rank['country'] == 'South Korea']['alcohol_rate'].values[0]
plt.annotate('South Korea : ' + str(korea_rank + 1), 
             xy=(korea_rank, korea_alc_rate), 
             xytext=(korea_rank + 10, korea_alc_rate + 0.05),
             arrowprops=dict(facecolor='red', shrink=0.05))

plt.show()

total_litres_of_pure_alcohol 평균을 구하기¶

이 평균값보다 적은 알코올을 섭취하는 대륙중에서 spirit을 가장 많이 마시는 국가를 구해보자¶

total_mean=drinks.total_litres_of_pure_alcohol.mean()
total_mean

4.717098445595855

continent_sum=drinks.groupby('continent').spirit_servings.agg(['sum'])
continent_sum

continent_sum['mean']=drinks.groupby('continent')['total_litres_of_pure_alcohol'].mean()
continent_sum

continent_sum[continent_mean<total_mean]

continent_sum[continent_mean<total_mean].loc[:,'sum'].idxmax()

'AS'

1)¶

a= drinks['continent'].str.contains('AS|AF|OC')
print(a)

0       True
1      False
2       True
3      False
4       True
       ...  
188    False
189     True
190     True
191     True
192     True
Name: continent, Length: 193, dtype: bool

b = drinks[a]
print(b)

country  beer_servings  spirit_servings  wine_servings  \
0    Afghanistan              0                0              0   
2        Algeria             25                0             14   
4         Angola            217               57             45   
8      Australia            261               72            212   
12       Bahrain             42               63              7   
..           ...            ...              ...            ...   
187      Vanuatu             21               18             11   
189      Vietnam            111                2              1   
190        Yemen              6                0              0   
191       Zambia             32               19              4   
192     Zimbabwe             64               18              4   

     total_litres_of_pure_alcohol continent  total_servings  alcohol_rate  
0                             0.0        AS               0      0.000000  
2                             0.7        AF              39      0.017949  
4                             5.9        AF             319      0.018495  
8                            10.4        OC             545      0.019083  
12                            2.0        AS             112      0.017857  
..                            ...       ...             ...           ...  
187                           0.9        OC              50      0.018000  
189                           2.0        AS             114      0.017544  
190                           0.1        AS               6      0.016667  
191                           2.5        AF              55      0.045455  
192                           4.7        AF              86      0.054651  

[113 rows x 8 columns]

spirit_continent_max = b[b['spirit_servings'] == b['spirit_servings'].max()]
spirit_continent_max

2)¶

a = drinks[drinks['continent'].isin(['AS', 'AF', 'OC'])]
a

spirit_continent_max = a[a['spirit_servings'] == a['spirit_servings'].max()]
spirit_continent_max

drinks

total_litres_of_pure_alcohol 평균을 구하기¶

이 평균값보다 적은 알코올을 섭취하는 대륙중에서 spirit을 가장 많이 마시는 국가를 구해보자¶

drinks

total_mean=drinks.total_litres_of_pure_alcohol.mean()
total_mean

4.717098445595855

continent_mean=drinks.groupby('continent')['total_litres_of_pure_alcohol'].mean()
continent_mean

continent
AF    3.007547
AS    2.170455
EU    8.617778
OC    3.381250
OT    5.995652
SA    6.308333
Name: total_litres_of_pure_alcohol, dtype: float64

 continent_under_mean= continent_mean[continent_mean <= total_mean].index.tolist()
 continent_under_mean

['AF', 'AS', 'OC']

A[A.column_name.isin(B)]¶

A데이터 프레임에서 A column_name 피처가 B안에 포함되어 있는 데이터만 가져옴

df_continent_under_mean= drinks.loc[drinks.continent.isin(continent_under_mean)] 
df_continent_under_mean

most_spirit_under_mean= df_continent_under_mean.loc[df_continent_under_mean['spirit_servings'].idxmax()]
most_spirit_under_mean

country                         Russian Federation
beer_servings                                  247
spirit_servings                                326
wine_servings                                   73
total_litres_of_pure_alcohol                  11.5
continent                                       AS
total_servings                                 646
alcohol_rate                             0.0178019
Name: 141, dtype: object

	beer_servings	spirit_servings	wine_servings	total_litres_of_pure_alcohol
count	193.000000	193.000000	193.000000	193.000000
mean	106.160622	80.994819	49.450777	4.717098
std	101.143103	88.284312	79.697598	3.773298
min	0.000000	0.000000	0.000000	0.000000
25%	20.000000	4.000000	1.000000	1.300000
50%	76.000000	56.000000	8.000000	4.200000
75%	188.000000	128.000000	59.000000	7.200000
max	376.000000	438.000000	370.000000	14.400000

	count	mean	min	max	sum	var	std
continent
AF	53	16.264151	0	233	862	1509.044267	38.846419
AS	44	9.068182	0	123	399	469.460359	21.667034
EU	45	142.222222	0	370	6400	9490.994949	97.421738
OC	16	35.625000	0	212	570	4167.450000	64.555790
OT	23	24.521739	1	100	564	798.988142	28.266378
SA	12	62.416667	1	221	749	7853.537879	88.620189

	country	alcohol_rate
63	Gambia	0.266667
153	Sierra Leone	0.223333
124	Nigeria	0.185714
179	Uganda	0.153704
142	Rwanda	0.151111

Python_우리나라 아기이름 데이터 분석 (1)	2020.09.15
Python_데이터분석2 (0)	2020.09.15
Python_example (0)	2020.09.11
python_pandas(판다스): 계층적 색인 지정, 누락된 데이터처리, 결측치채우기, 데이터 변형하기, onehot인코딩 (0)	2020.09.11
Python_pandas 문제 (0)	2020.09.09

내 블로그 - 관리자 홈 전환	`Q` `Q`
새 글 쓰기	`W` `W`

.

Python_판다스_데이터분석

read_csv로 데이터를 Dataframe형태로 불러옴¶

correlation : 각 숫자형 변수들 간의 상관관계를 나타낸 계수¶

scatterplot(산점도) : 탐색적 분석에서 필수적으로 사용하며 각 점들을 찍은 그래프(역시 숫자형 변수들만)¶

결측 데이터 전처리¶

시각화¶

total_litres_of_pure_alcohol 평균을 구하기¶

이 평균값보다 적은 알코올을 섭취하는 대륙중에서 spirit을 가장 많이 마시는 국가를 구해보자¶

1)¶

2)¶

total_litres_of_pure_alcohol 평균을 구하기¶

이 평균값보다 적은 알코올을 섭취하는 대륙중에서 spirit을 가장 많이 마시는 국가를 구해보자¶

A[A.column_name.isin(B)]¶

'Python' 카테고리의 다른 글

+ Recent posts

티스토리툴바

단축키

내 블로그

블로그 게시글

모든 영역

	country	beer_servings	spirit_servings	wine_servings	total_litres_of_pure_alcohol	continent
0	Afghanistan	0	0	0	0.0	AS
1	Albania	89	132	54	4.9	EU
2	Algeria	25	0	14	0.7	AF
3	Andorra	245	138	312	12.4	EU
4	Angola	217	57	45	5.9	AF
5	Antigua & Barbuda	102	128	45	4.9	NaN
6	Argentina	193	25	221	8.3	SA
7	Armenia	21	179	11	3.8	EU
8	Australia	261	72	212	10.4	OC
9	Austria	279	75	191	9.7	EU

	mean	min	max	sum
continent
AF	16.339623	0	152	866
AS	60.840909	0	326	2677
EU	132.555556	0	373	5965
OC	58.437500	0	254	935
OT	165.739130	68	438	3812
SA	114.750000	25	302	1377

	count	mean	min	max	sum	var	std
continent
AF	53	16.339623	0	152	866	789.767054	28.102794
AS	44	60.840909	0	326	2677	7116.974101	84.362160
EU	45	132.555556	0	373	5965	6020.070707	77.589115
OC	16	58.437500	0	254	935	4970.929167	70.504817
OT	23	165.739130	68	438	3812	9023.837945	94.993884
SA	12	114.750000	25	302	1377	5940.931818	77.077440

	count	mean	min	max	sum	var	std
continent
AF	53	61.471698	0	376	3258	6489.561684	80.557816
AS	44	37.045455	0	247	1630	2447.253700	49.469725
EU	45	193.777778	0	361	8720	9926.449495	99.631569
OC	16	89.687500	0	306	1435	9339.562500	96.641412
OT	23	145.434783	1	285	3345	6339.529644	79.621163
SA	12	175.083333	93	333	2101	4256.628788	65.242845

	count	mean	min	max	sum	var	std
continent
AF	53	3.007547	0.0	9.1	159.4	7.009557	2.647557
AS	44	2.170455	0.0	11.5	95.5	7.674223	2.770239
EU	45	8.617778	0.0	14.4	387.8	11.279222	3.358455
OC	16	3.381250	0.0	10.4	54.1	11.193625	3.345688
OT	23	5.995652	2.2	11.9	137.9	5.804980	2.409353
SA	12	6.308333	3.8	8.3	75.7	2.344470	1.531166

	country	beer_servings	spirit_servings	wine_servings	total_litres_of_pure_alcohol	continent
2	Algeria	25	0	14	0.7	AF
4	Angola	217	57	45	5.9	AF
18	Benin	34	4	13	1.1	AF
22	Botswana	173	35	35	5.4	AF
26	Burkina Faso	25	7	7	4.3	AF

글 수정 (권한 있는 경우)	`E` `E`
댓글 영역으로 이동	`C` `C`

이 페이지의 URL 복사	`S` `S`
맨 위로 이동	`T` `T`
티스토리 홈 이동	`H` `H`
단축키 안내	`Shift` + `/` `⇧` + `/`