200914
In [1]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt

read_csv로 데이터를 Dataframe형태로 불러옴

In [2]:
file_path='data/drinks.csv'
drinks=pd.read_csv(file_path)
  • info()함수를 통해 결측(NaN)값을 확인할 수 있음
In [3]:
print(drinks.info())
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 193 entries, 0 to 192
Data columns (total 6 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   country                       193 non-null    object 
 1   beer_servings                 193 non-null    int64  
 2   spirit_servings               193 non-null    int64  
 3   wine_servings                 193 non-null    int64  
 4   total_litres_of_pure_alcohol  193 non-null    float64
 5   continent                     170 non-null    object 
dtypes: float64(1), int64(3), object(2)
memory usage: 9.2+ KB
None
In [4]:
drinks.head(10) #디폴트 값은 5
Out[4]:
country beer_servings spirit_servings wine_servings total_litres_of_pure_alcohol continent
0 Afghanistan 0 0 0 0.0 AS
1 Albania 89 132 54 4.9 EU
2 Algeria 25 0 14 0.7 AF
3 Andorra 245 138 312 12.4 EU
4 Angola 217 57 45 5.9 AF
5 Antigua & Barbuda 102 128 45 4.9 NaN
6 Argentina 193 25 221 8.3 SA
7 Armenia 21 179 11 3.8 EU
8 Australia 261 72 212 10.4 OC
9 Austria 279 75 191 9.7 EU
In [5]:
drinks.describe() # 숫자만 describe로 나옴 
Out[5]:
beer_servings spirit_servings wine_servings total_litres_of_pure_alcohol
count 193.000000 193.000000 193.000000 193.000000
mean 106.160622 80.994819 49.450777 4.717098
std 101.143103 88.284312 79.697598 3.773298
min 0.000000 0.000000 0.000000 0.000000
25% 20.000000 4.000000 1.000000 1.300000
50% 76.000000 56.000000 8.000000 4.200000
75% 188.000000 128.000000 59.000000 7.200000
max 376.000000 438.000000 370.000000 14.400000
In [6]:
corr=drinks[['beer_servings', 'wine_servings']].corr(method='pearson') #두 변수 X 와 Y 간의 선형 상관 관계를 계량화한 수치다
print(corr) # beer가 1일
beer_servings  wine_servings
beer_servings       1.000000       0.527172
wine_servings       0.527172       1.000000
In [7]:
cols=['beer_servings','spirit_servings','wine_servings','total_litres_of_pure_alcohol']
corr= drinks[cols].corr(method='pearson')
print(corr)
beer_servings  spirit_servings  wine_servings  \
beer_servings                      1.000000         0.458819       0.527172   
spirit_servings                    0.458819         1.000000       0.194797   
wine_servings                      0.527172         0.194797       1.000000   
total_litres_of_pure_alcohol       0.835839         0.654968       0.667598   

                              total_litres_of_pure_alcohol  
beer_servings                                     0.835839  
spirit_servings                                   0.654968  
wine_servings                                     0.667598  
total_litres_of_pure_alcohol                      1.000000  
In [8]:
corr.values
Out[8]:
array([[1.        , 0.45881887, 0.52717169, 0.83583863],
       [0.45881887, 1.        , 0.19479705, 0.65496818],
       [0.52717169, 0.19479705, 1.        , 0.66759834],
       [0.83583863, 0.65496818, 0.66759834, 1.        ]])

heatmap[heat(열)+map(지도)]: 데이터들의 배열을 색상으로 표현해주는 그래프

  • heatmap을 사용하면 두 개의 카테고리 값에 대한 값 변화를 한눈에 알기 쉬움
  • 대용량 데이터도 쉽게 표현 가능
In [9]:
import seaborn as sns
# corr 행렬 히트맵을 시각화합니다.
cols_view = ['beer', 'spirit', 'wine', 'alcohol'] # 그래프 출력을 위한 cols 이름을 축약합니다.
sns.set(font_scale=1.5)
hm = sns.heatmap(corr.values,
            cbar=True, #컬러바 유무
            annot=True, # 각 셀에 숫자 입력 
            fmt='.2f', #둘째자리까지
            square=True,
            annot_kws={'size': 15}, #숫자 크기 조절 
            yticklabels=cols_view,
            xticklabels=cols_view)

plt.tight_layout() # 여백
plt.show()
In [10]:
# 시각화 라이브러리를 이용한 피처간의 scatter plot을 출력합니다.
sns.set(style='whitegrid', context='notebook')
sns.pairplot(drinks[['beer_servings', 'spirit_servings', 
                     'wine_servings', 'total_litres_of_pure_alcohol']], height=2.5)
plt.show()
# 점 하나는 나라 하나

correlation : 각 숫자형 변수들 간의 상관관계를 나타낸 계수

  • 1~0.7는 강한 양의 상관관계 ex) y = x
  • 0.7~-1은 강한 음의 상관관계 ex) y= -x
    (0에 가까울 수록 두 변수들 간의 상관이 없다)

scatterplot(산점도) : 탐색적 분석에서 필수적으로 사용하며 각 점들을 찍은 그래프(역시 숫자형 변수들만)

  • 그래프를 이용하여 각 변수들 간의 상관관계나 모양 같은 것을 봄
  • y = x 와 같은 상관관계를 쉽게 볼 수 있고, 다른 관계도 추측할 수 있다.

결측 데이터 전처리

In [11]:
print(drinks.isnull().sum())
print("----------------------------------------")
print(drinks.dtypes)
country                          0
beer_servings                    0
spirit_servings                  0
wine_servings                    0
total_litres_of_pure_alcohol     0
continent                       23
dtype: int64
----------------------------------------
country                          object
beer_servings                     int64
spirit_servings                   int64
wine_servings                     int64
total_litres_of_pure_alcohol    float64
continent                        object
dtype: object
  • 결측 데이터 채우기
In [12]:
drinks['continent']=drinks['continent'].fillna('OT') #OT=OTHER
In [13]:
print(drinks.isnull().sum())
country                         0
beer_servings                   0
spirit_servings                 0
wine_servings                   0
total_litres_of_pure_alcohol    0
continent                       0
dtype: int64
In [14]:
print(drinks.info()) # 5번 193으로 달라짐
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 193 entries, 0 to 192
Data columns (total 6 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   country                       193 non-null    object 
 1   beer_servings                 193 non-null    int64  
 2   spirit_servings               193 non-null    int64  
 3   wine_servings                 193 non-null    int64  
 4   total_litres_of_pure_alcohol  193 non-null    float64
 5   continent                     193 non-null    object 
dtypes: float64(1), int64(3), object(2)
memory usage: 9.2+ KB
None
In [15]:
labels= drinks['continent'].value_counts().index.tolist() #index를 리스트로 만들고
fracs1=drinks['continent'].value_counts().values.tolist() #continent의 valuses를 리스트로 
explode= (0,0,0,0.25,0,0)
plt.pie(fracs1,explode=explode, labels=labels, autopct='%.0f%%', shadow=True) #shadow=True 그림자 붙이기 
plt.title("NULL Data to 'OT'")
plt.show()
In [16]:
labels= drinks['continent'].value_counts().index.tolist()
print(labels)
['AF', 'EU', 'AS', 'OT', 'OC', 'SA']
In [17]:
fracs1=drinks['continent'].value_counts().values.tolist()
print(fracs1)
[53, 45, 44, 23, 16, 12]
In [18]:
#plt.rcParams.update({'font.size': 15}) #(1)
plt.figure(figsize=(5,5)) #그림크기
#plt.rc('font',size=15) #폰트크기 (2)
explode= (0,0,0,0.2,0,0)
plt.pie(fracs1, #데이터값,values, 
explode=explode, #파이에서 튀어나오는 위치 
labels=labels, 
colors=None, 
autopct='%.1f%%',
pctdistance=0.6, 
shadow=False,
labeldistance=1.1, 
startangle=230, # 90넣으면 반시계방향으로 90도 돔
radius=None, 
counterclock=True, #반시계방향, False하면 위치바뀜 
wedgeprops=None, 
textprops=None, 
center=(0, 0), 
frame=False, 
rotatelabels=False)
plt.title("NULL Data to 'OT'")
plt.show()
#plt.rcParams.update({'font.size': 15}) (1)
plt.rc('font',size=10) #폰트 크기를 원래대로 지정 (2)
In [19]:
result=drinks.groupby('continent').spirit_servings.agg(['mean','min','max','sum'])
result.head(10)
Out[19]:
mean min max sum
continent
AF 16.339623 0 152 866
AS 60.840909 0 326 2677
EU 132.555556 0 373 5965
OC 58.437500 0 254 935
OT 165.739130 68 438 3812
SA 114.750000 25 302 1377
In [20]:
result=drinks.groupby('continent').spirit_servings. \
        agg(['count','mean','min','max','sum','var','std']) #agg함수: 여러 개 적용할 때 사용/ var:분산, std:표준편차
result.head(10)
Out[20]:
count mean min max sum var std
continent
AF 53 16.339623 0 152 866 789.767054 28.102794
AS 44 60.840909 0 326 2677 7116.974101 84.362160
EU 45 132.555556 0 373 5965 6020.070707 77.589115
OC 16 58.437500 0 254 935 4970.929167 70.504817
OT 23 165.739130 68 438 3812 9023.837945 94.993884
SA 12 114.750000 25 302 1377 5940.931818 77.077440
In [21]:
result=drinks.groupby('continent').beer_servings. \
        agg(['count','mean','min','max','sum','var','std']) #agg함수: 여러 개 적용할 때 사용/ var:분산, std:표준편차
result.head(10)
Out[21]:
count mean min max sum var std
continent
AF 53 61.471698 0 376 3258 6489.561684 80.557816
AS 44 37.045455 0 247 1630 2447.253700 49.469725
EU 45 193.777778 0 361 8720 9926.449495 99.631569
OC 16 89.687500 0 306 1435 9339.562500 96.641412
OT 23 145.434783 1 285 3345 6339.529644 79.621163
SA 12 175.083333 93 333 2101 4256.628788 65.242845
In [22]:
result=drinks.groupby('continent').wine_servings. \
        agg(['count','mean','min','max','sum','var','std']) #agg함수: 여러 개 적용할 때 사용/ var:분산, std:표준편차
result.head(10)
Out[22]:
count mean min max sum var std
continent
AF 53 16.264151 0 233 862 1509.044267 38.846419
AS 44 9.068182 0 123 399 469.460359 21.667034
EU 45 142.222222 0 370 6400 9490.994949 97.421738
OC 16 35.625000 0 212 570 4167.450000 64.555790
OT 23 24.521739 1 100 564 798.988142 28.266378
SA 12 62.416667 1 221 749 7853.537879 88.620189
In [23]:
result=drinks.groupby('continent').total_litres_of_pure_alcohol. \
        agg(['count','mean','min','max','sum','var','std']) #agg함수: 여러 개 적용할 때 사용/ var:분산, std:표준편차
result.head(10)
Out[23]:
count mean min max sum var std
continent
AF 53 3.007547 0.0 9.1 159.4 7.009557 2.647557
AS 44 2.170455 0.0 11.5 95.5 7.674223 2.770239
EU 45 8.617778 0.0 14.4 387.8 11.279222 3.358455
OC 16 3.381250 0.0 10.4 54.1 11.193625 3.345688
OT 23 5.995652 2.2 11.9 137.9 5.804980 2.409353
SA 12 6.308333 3.8 8.3 75.7 2.344470 1.531166

total_litres_of_pure_alcohol의 전체평균

In [24]:
total_mean=drinks.total_litres_of_pure_alcohol.mean()
total_mean
Out[24]:
4.717098445595855
In [25]:
continent_mean=drinks.groupby('continent')['total_litres_of_pure_alcohol'].mean()
print(continent_mean)
continent
AF    3.007547
AS    2.170455
EU    8.617778
OC    3.381250
OT    5.995652
SA    6.308333
Name: total_litres_of_pure_alcohol, dtype: float64
In [26]:
continent_over_mean=continent_mean[continent_mean >=total_mean]
print(continent_over_mean)
continent
EU    8.617778
OT    5.995652
SA    6.308333
Name: total_litres_of_pure_alcohol, dtype: float64
  • 평균 beer_servings가 가장 높은 대륙
In [27]:
beer_continent= drinks.groupby('continent').beer_servings.mean()
beer_continent
Out[27]:
continent
AF     61.471698
AS     37.045455
EU    193.777778
OC     89.687500
OT    145.434783
SA    175.083333
Name: beer_servings, dtype: float64
In [28]:
beer_continent_max=beer_continent[beer_continent==beer_continent.max()]
beer_continent_max
Out[28]:
continent
EU    193.777778
Name: beer_servings, dtype: float64
In [29]:
beer_continent.idxmax()
Out[29]:
'EU'
In [30]:
beer_continent[beer_continent.idxmax()]
Out[30]:
193.77777777777777
  • 평균 beer_servings가 가장 낮은 대륙
In [31]:
beer_continent_min=beer_continent[beer_continent==beer_continent.min()]
beer_continent_min
Out[31]:
continent
AS    37.045455
Name: beer_servings, dtype: float64
In [32]:
beer_continent.idxmin()
Out[32]:
'AS'
In [33]:
beer_continent[beer_continent.idxmin()]
Out[33]:
37.04545454545455

시각화

  • 대륙별 spirit_servings의 평균,최소,최대를 시각화
In [34]:
result=drinks.groupby('continent').spirit_servings. \
        agg(['mean','min','max','count'])
result.head(10)
Out[34]:
mean min max count
continent
AF 16.339623 0 152 53
AS 60.840909 0 326 44
EU 132.555556 0 373 45
OC 58.437500 0 254 16
OT 165.739130 68 438 23
SA 114.750000 25 302 12
In [35]:
number_groups=len(result.index) 
number_groups
Out[35]:
6
In [36]:
means= result['mean'].tolist()
means
Out[36]:
[16.339622641509433,
 60.84090909090909,
 132.55555555555554,
 58.4375,
 165.7391304347826,
 114.75]
In [37]:
mins= result['min'].tolist()
maxs= result['max'].tolist()
counts= result['count'].tolist() 
index=np.arange(number_groups)

plt.figure(figsize=(10,5)) #그림크기
bar_width= 0.2 #0.8 디폴트 

rects_1= plt.bar(index, means, bar_width, color='r', label='Mean')
rects_2= plt.bar(index+bar_width, mins, bar_width, color='g', label='Min')
rects_3= plt.bar(index+bar_width*2, maxs, bar_width, color='b', label='Max')
rects_4= plt.bar(index+bar_width*3, counts, bar_width, color='y', label='Count')

plt.xticks(index, result.index.tolist())
plt.legend()
plt.show()
  • 대륙별 total_litres_of_pure_alcohol 시각화
In [38]:
continent_mean=drinks.groupby('continent')['total_litres_of_pure_alcohol'].mean()
continent_mean
Out[38]:
continent
AF    3.007547
AS    2.170455
EU    8.617778
OC    3.381250
OT    5.995652
SA    6.308333
Name: total_litres_of_pure_alcohol, dtype: float64
In [39]:
continents= continent_mean.index.tolist()
continents
Out[39]:
['AF', 'AS', 'EU', 'OC', 'OT', 'SA']
In [40]:
continents.append('MEAN')
continents
Out[40]:
['AF', 'AS', 'EU', 'OC', 'OT', 'SA', 'MEAN']
In [41]:
x_pos=np.arange(len(continents)) #그래프 세로
x_pos
Out[41]:
array([0, 1, 2, 3, 4, 5, 6])
In [42]:
alcohol= continent_mean.tolist()
alcohol.append(total_mean)
alcohol
Out[42]:
[3.00754716981132,
 2.1704545454545454,
 8.617777777777777,
 3.3812500000000005,
 5.995652173913044,
 6.308333333333334,
 4.717098445595855]
In [43]:
total_mean
Out[43]:
4.717098445595855
In [44]:
#plt.grid(b=True, which='both', axis='both')
bar_list = plt.bar(x_pos, alcohol, align='center', alpha=0.5) 
bar_list[len(continents) - 1].set_color('r') # 맨 마지막을 빨간색으로 
plt.plot([0., 6], [total_mean, total_mean], "k--") #점선 그리기 
plt.xticks(x_pos, continents)

plt.ylabel('total_litres_of_pure_alcohol')
plt.title('total_litres_of_pure_alcohol by Continent')

plt.show()
In [45]:
#plt.grid(b=True, which='both', axis='both')
bar_list = plt.bar(x_pos, alcohol, align='center', alpha=0.5) 
bar_list[len(continents) - 1].set_color('g') 
plt.plot([0., 6], [total_mean, total_mean], "k--") #점선 그리기 
plt.xticks(x_pos, continents)

plt.ylabel('total_litres_of_pure_alcohol')
plt.title('total_litres_of_pure_alcohol by Continent')

plt.show()
In [46]:
continent_mean
Out[46]:
continent
AF    3.007547
AS    2.170455
EU    8.617778
OC    3.381250
OT    5.995652
SA    6.308333
Name: total_litres_of_pure_alcohol, dtype: float64
In [47]:
#plt.grid(b=True, which='both', axis='both')
bar_list = plt.bar(x_pos, alcohol, align='center', alpha=0.5) 
bar_list[len(continents) - 1].set_color('g') 
plt.plot([3., 5], [continent_mean['SA'], continent_mean['SA']], "k--") #점선 그리기 
plt.xticks(x_pos, continents)

plt.ylabel('total_litres_of_pure_alcohol')
plt.title('total_litres_of_pure_alcohol by Continent')

plt.show()
In [48]:
!pip install scipy
Requirement already satisfied: scipy in c:\users\w\anaconda3\lib\site-packages (1.5.0)
Requirement already satisfied: numpy>=1.14.5 in c:\users\w\anaconda3\lib\site-packages (from scipy) (1.18.5)
In [49]:
africa=drinks.loc[drinks['continent']=='AF']
africa.head()
Out[49]:
country beer_servings spirit_servings wine_servings total_litres_of_pure_alcohol continent
2 Algeria 25 0 14 0.7 AF
4 Angola 217 57 45 5.9 AF
18 Benin 34 4 13 1.1 AF
22 Botswana 173 35 35 5.4 AF
26 Burkina Faso 25 7 7 4.3 AF
In [50]:
europe=drinks.loc[drinks['continent']=='EU']
europe.head()
Out[50]:
country beer_servings spirit_servings wine_servings total_litres_of_pure_alcohol continent
1 Albania 89 132 54 4.9 EU
3 Andorra 245 138 312 12.4 EU
7 Armenia 21 179 11 3.8 EU
9 Austria 279 75 191 9.7 EU
10 Azerbaijan 21 46 5 1.3 EU
In [51]:
drinks.head()
Out[51]:
country beer_servings spirit_servings wine_servings total_litres_of_pure_alcohol continent
0 Afghanistan 0 0 0 0.0 AS
1 Albania 89 132 54 4.9 EU
2 Algeria 25 0 14 0.7 AF
3 Andorra 245 138 312 12.4 EU
4 Angola 217 57 45 5.9 AF
In [52]:
drinks['total_servings']=drinks['beer_servings']+ drinks['wine_servings']+ drinks['spirit_servings']
In [53]:
drinks['alcohol_rate']= drinks['total_litres_of_pure_alcohol'] / drinks['total_servings']

drinks['alcohol_rate']= drinks['alcohol_rate'].fillna(0)
In [54]:
drinks.head()
Out[54]:
country beer_servings spirit_servings wine_servings total_litres_of_pure_alcohol continent total_servings alcohol_rate
0 Afghanistan 0 0 0 0.0 AS 0 0.000000
1 Albania 89 132 54 4.9 EU 275 0.017818
2 Algeria 25 0 14 0.7 AF 39 0.017949
3 Andorra 245 138 312 12.4 EU 695 0.017842
4 Angola 217 57 45 5.9 AF 319 0.018495
In [55]:
country_rank= drinks[['country','alcohol_rate']]
country_rank= country_rank.sort_values(by=['alcohol_rate'], ascending= 0)
country_rank.head()
Out[55]:
country alcohol_rate
63 Gambia 0.266667
153 Sierra Leone 0.223333
124 Nigeria 0.185714
179 Uganda 0.153704
142 Rwanda 0.151111
In [56]:
country_list = country_rank.country.tolist()
x_pos = np.arange(len(country_list))
rank = country_rank.alcohol_rate.tolist()
 
bar_list = plt.bar(x_pos, rank)
bar_list[country_list.index("South Korea")].set_color('r')
plt.ylabel('alcohol rate')
plt.title('liquor drink rank by contry')
plt.axis([0, 200, 0, 0.3])

korea_rank = country_list.index("South Korea")
korea_alc_rate = country_rank[country_rank['country'] == 'South Korea']['alcohol_rate'].values[0]
plt.annotate('South Korea : ' + str(korea_rank + 1), 
             xy=(korea_rank, korea_alc_rate), 
             xytext=(korea_rank + 10, korea_alc_rate + 0.05),
             arrowprops=dict(facecolor='red', shrink=0.05))

plt.show()

total_litres_of_pure_alcohol 평균을 구하기

이 평균값보다 적은 알코올을 섭취하는 대륙중에서 spirit을 가장 많이 마시는 국가를 구해보자

In [57]:
total_mean=drinks.total_litres_of_pure_alcohol.mean()
total_mean
Out[57]:
4.717098445595855
In [58]:
continent_sum=drinks.groupby('continent').spirit_servings.agg(['sum'])
continent_sum
Out[58]:
sum
continent
AF 866
AS 2677
EU 5965
OC 935
OT 3812
SA 1377
In [59]:
continent_sum['mean']=drinks.groupby('continent')['total_litres_of_pure_alcohol'].mean()
continent_sum
Out[59]:
sum mean
continent
AF 866 3.007547
AS 2677 2.170455
EU 5965 8.617778
OC 935 3.381250
OT 3812 5.995652
SA 1377 6.308333
In [60]:
continent_sum[continent_mean<total_mean]
Out[60]:
sum mean
continent
AF 866 3.007547
AS 2677 2.170455
OC 935 3.381250
In [61]:
continent_sum[continent_mean<total_mean].loc[:,'sum'].idxmax()
Out[61]:
'AS'

1)

In [62]:
a= drinks['continent'].str.contains('AS|AF|OC')
print(a)
0       True
1      False
2       True
3      False
4       True
       ...  
188    False
189     True
190     True
191     True
192     True
Name: continent, Length: 193, dtype: bool
In [63]:
b = drinks[a]
print(b)
country  beer_servings  spirit_servings  wine_servings  \
0    Afghanistan              0                0              0   
2        Algeria             25                0             14   
4         Angola            217               57             45   
8      Australia            261               72            212   
12       Bahrain             42               63              7   
..           ...            ...              ...            ...   
187      Vanuatu             21               18             11   
189      Vietnam            111                2              1   
190        Yemen              6                0              0   
191       Zambia             32               19              4   
192     Zimbabwe             64               18              4   

     total_litres_of_pure_alcohol continent  total_servings  alcohol_rate  
0                             0.0        AS               0      0.000000  
2                             0.7        AF              39      0.017949  
4                             5.9        AF             319      0.018495  
8                            10.4        OC             545      0.019083  
12                            2.0        AS             112      0.017857  
..                            ...       ...             ...           ...  
187                           0.9        OC              50      0.018000  
189                           2.0        AS             114      0.017544  
190                           0.1        AS               6      0.016667  
191                           2.5        AF              55      0.045455  
192                           4.7        AF              86      0.054651  

[113 rows x 8 columns]
In [64]:
spirit_continent_max = b[b['spirit_servings'] == b['spirit_servings'].max()]
spirit_continent_max
Out[64]:
country beer_servings spirit_servings wine_servings total_litres_of_pure_alcohol continent total_servings alcohol_rate
141 Russian Federation 247 326 73 11.5 AS 646 0.017802

2)

In [65]:
a = drinks[drinks['continent'].isin(['AS', 'AF', 'OC'])]
a
Out[65]:
country beer_servings spirit_servings wine_servings total_litres_of_pure_alcohol continent total_servings alcohol_rate
0 Afghanistan 0 0 0 0.0 AS 0 0.000000
2 Algeria 25 0 14 0.7 AF 39 0.017949
4 Angola 217 57 45 5.9 AF 319 0.018495
8 Australia 261 72 212 10.4 OC 545 0.019083
12 Bahrain 42 63 7 2.0 AS 112 0.017857
... ... ... ... ... ... ... ... ...
187 Vanuatu 21 18 11 0.9 OC 50 0.018000
189 Vietnam 111 2 1 2.0 AS 114 0.017544
190 Yemen 6 0 0 0.1 AS 6 0.016667
191 Zambia 32 19 4 2.5 AF 55 0.045455
192 Zimbabwe 64 18 4 4.7 AF 86 0.054651

113 rows × 8 columns

In [66]:
spirit_continent_max = a[a['spirit_servings'] == a['spirit_servings'].max()]
spirit_continent_max
Out[66]:
country beer_servings spirit_servings wine_servings total_litres_of_pure_alcohol continent total_servings alcohol_rate
141 Russian Federation 247 326 73 11.5 AS 646 0.017802
In [67]:
drinks
Out[67]:
country beer_servings spirit_servings wine_servings total_litres_of_pure_alcohol continent total_servings alcohol_rate
0 Afghanistan 0 0 0 0.0 AS 0 0.000000
1 Albania 89 132 54 4.9 EU 275 0.017818
2 Algeria 25 0 14 0.7 AF 39 0.017949
3 Andorra 245 138 312 12.4 EU 695 0.017842
4 Angola 217 57 45 5.9 AF 319 0.018495
... ... ... ... ... ... ... ... ...
188 Venezuela 333 100 3 7.7 SA 436 0.017661
189 Vietnam 111 2 1 2.0 AS 114 0.017544
190 Yemen 6 0 0 0.1 AS 6 0.016667
191 Zambia 32 19 4 2.5 AF 55 0.045455
192 Zimbabwe 64 18 4 4.7 AF 86 0.054651

193 rows × 8 columns

total_litres_of_pure_alcohol 평균을 구하기

이 평균값보다 적은 알코올을 섭취하는 대륙중에서 spirit을 가장 많이 마시는 국가를 구해보자

In [71]:
drinks
Out[71]:
country beer_servings spirit_servings wine_servings total_litres_of_pure_alcohol continent total_servings alcohol_rate
0 Afghanistan 0 0 0 0.0 AS 0 0.000000
1 Albania 89 132 54 4.9 EU 275 0.017818
2 Algeria 25 0 14 0.7 AF 39 0.017949
3 Andorra 245 138 312 12.4 EU 695 0.017842
4 Angola 217 57 45 5.9 AF 319 0.018495
... ... ... ... ... ... ... ... ...
188 Venezuela 333 100 3 7.7 SA 436 0.017661
189 Vietnam 111 2 1 2.0 AS 114 0.017544
190 Yemen 6 0 0 0.1 AS 6 0.016667
191 Zambia 32 19 4 2.5 AF 55 0.045455
192 Zimbabwe 64 18 4 4.7 AF 86 0.054651

193 rows × 8 columns

In [77]:
total_mean=drinks.total_litres_of_pure_alcohol.mean()
total_mean
Out[77]:
4.717098445595855
In [78]:
continent_mean=drinks.groupby('continent')['total_litres_of_pure_alcohol'].mean()
continent_mean
Out[78]:
continent
AF    3.007547
AS    2.170455
EU    8.617778
OC    3.381250
OT    5.995652
SA    6.308333
Name: total_litres_of_pure_alcohol, dtype: float64
In [89]:
 continent_under_mean= continent_mean[continent_mean <= total_mean].index.tolist()
 continent_under_mean
Out[89]:
['AF', 'AS', 'OC']

A[A.column_name.isin(B)]

  • A데이터 프레임에서 A column_name 피처가 B안에 포함되어 있는 데이터만 가져옴
In [90]:
df_continent_under_mean= drinks.loc[drinks.continent.isin(continent_under_mean)] 
df_continent_under_mean
Out[90]:
country beer_servings spirit_servings wine_servings total_litres_of_pure_alcohol continent total_servings alcohol_rate
0 Afghanistan 0 0 0 0.0 AS 0 0.000000
2 Algeria 25 0 14 0.7 AF 39 0.017949
4 Angola 217 57 45 5.9 AF 319 0.018495
8 Australia 261 72 212 10.4 OC 545 0.019083
12 Bahrain 42 63 7 2.0 AS 112 0.017857
... ... ... ... ... ... ... ... ...
187 Vanuatu 21 18 11 0.9 OC 50 0.018000
189 Vietnam 111 2 1 2.0 AS 114 0.017544
190 Yemen 6 0 0 0.1 AS 6 0.016667
191 Zambia 32 19 4 2.5 AF 55 0.045455
192 Zimbabwe 64 18 4 4.7 AF 86 0.054651

113 rows × 8 columns

In [91]:
most_spirit_under_mean= df_continent_under_mean.loc[df_continent_under_mean['spirit_servings'].idxmax()]
most_spirit_under_mean
Out[91]:
country                         Russian Federation
beer_servings                                  247
spirit_servings                                326
wine_servings                                   73
total_litres_of_pure_alcohol                  11.5
continent                                       AS
total_servings                                 646
alcohol_rate                             0.0178019
Name: 141, dtype: object

+ Recent posts