In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
read_csv로 데이터를 DataFrame 형태로 불러옴
In [2]:
# Location of the drinks dataset, relative to the notebook's working directory.
file_path='data/drinks.csv'
# Load the CSV into a DataFrame.
# NOTE(review): read_csv treats the literal string "NA" as missing by default; if
# the file encodes a continent as "NA", those rows load as NaN — confirm.
drinks=pd.read_csv(file_path)
- info()함수를 통해 결측(NaN)값을 확인할 수 있음
In [3]:
# DataFrame.info() writes its summary (dtypes, non-null counts) to stdout and
# returns None, so wrapping it in print() only appends a stray "None" line.
drinks.info()
In [4]:
drinks.head(10) # show the first 10 rows (head() defaults to 5)
Out[4]:
In [5]:
drinks.describe() # summary statistics; only numeric columns are included
Out[5]:
In [6]:
# Pearson correlation quantifies the linear relationship between two variables;
# here it is computed between beer and wine servings.
beer_wine = drinks[['beer_servings', 'wine_servings']]
corr = beer_wine.corr(method='pearson')
print(corr)
In [7]:
# Pairwise Pearson correlation across the four consumption features.
cols = [
    'beer_servings',
    'spirit_servings',
    'wine_servings',
    'total_litres_of_pure_alcohol',
]
corr = drinks.loc[:, cols].corr(method='pearson')
print(corr)
In [8]:
# Raw correlation matrix as a NumPy array (this is what the heatmap below consumes).
corr.values
Out[8]:
heatmap[heat(열)+map(지도)]: 데이터들의 배열을 색상으로 표현해주는 그래프
- heatmap을 사용하면 두 개의 카테고리 값에 대한 값 변화를 한눈에 알기 쉬움
- 대용량 데이터도 쉽게 표현 가능
In [9]:
import seaborn as sns

# Visualise the correlation matrix as a heatmap.
# Shortened feature names keep the axis labels readable.
cols_view = ['beer', 'spirit', 'wine', 'alcohol']
sns.set(font_scale=1.5)

heatmap_options = dict(
    cbar=True,               # draw the colour bar
    annot=True,              # write the value inside each cell
    fmt='.2f',               # two decimal places
    square=True,
    annot_kws={'size': 15},  # annotation font size
    yticklabels=cols_view,
    xticklabels=cols_view,
)
hm = sns.heatmap(corr.values, **heatmap_options)

plt.tight_layout()  # trim surrounding whitespace
plt.show()
In [10]:
# Scatter-plot matrix of the consumption features; each point is one country.
sns.set(style='whitegrid', context='notebook')
pair_cols = [
    'beer_servings',
    'spirit_servings',
    'wine_servings',
    'total_litres_of_pure_alcohol',
]
sns.pairplot(drinks[pair_cols], height=2.5)
plt.show()
결측 데이터 전처리
In [11]:
# Missing-value count per column, followed by each column's dtype.
missing_per_column = drinks.isnull().sum()
print(missing_per_column)
print("----------------------------------------")
print(drinks.dtypes)
- 결측 데이터 채우기
In [12]:
drinks['continent']=drinks['continent'].fillna('OT') # replace missing continents with 'OT' (OTHER)
In [13]:
# Re-check the per-column missing counts after filling 'continent'.
print(drinks.isnull().sum())
In [14]:
# info() prints its own summary and returns None, so it is called directly
# instead of through print(), which would add a stray "None" line.
# Per the original note, entry 5 now reports 193 non-null values.
drinks.info()
In [15]:
# Countries per continent. value_counts() orders by frequency, so the position
# of each label depends on the data; compute it once and reuse it.
continent_counts = drinks['continent'].value_counts()
labels = continent_counts.index.tolist()   # continent codes
fracs1 = continent_counts.values.tolist()  # matching country counts

# Pull out the 'OT' wedge by label instead of by a hard-coded position, so the
# explode sequence always matches the number and order of the wedges.
explode = tuple(0.25 if label == 'OT' else 0 for label in labels)

plt.pie(fracs1, explode=explode, labels=labels, autopct='%.0f%%', shadow=True)  # shadow=True adds a drop shadow
plt.title("NULL Data to 'OT'")
plt.show()
In [16]:
# Continent codes, ordered by how many countries each contains.
continent_counts = drinks['continent'].value_counts()
labels = continent_counts.index.tolist()
print(labels)
In [17]:
# Country count for each continent, in the same order as the labels.
continent_counts = drinks['continent'].value_counts()
fracs1 = continent_counts.values.tolist()
print(fracs1)
In [18]:
# Pie chart of countries per continent with every pie() option spelled out.
# To enlarge the text for this chart, call plt.rc('font', size=15) (or
# plt.rcParams.update({'font.size': 15})) before plotting.
plt.figure(figsize=(5, 5))  # figure size in inches

# Offset the 'OT' wedge by label rather than by a hard-coded position, so the
# sequence always has one entry per wedge.
explode = tuple(0.2 if label == 'OT' else 0 for label in labels)

plt.pie(fracs1,                # wedge sizes
        explode=explode,       # how far each wedge is pushed out from the centre
        labels=labels,
        colors=None,
        autopct='%.1f%%',
        pctdistance=0.6,
        shadow=False,
        labeldistance=1.1,
        startangle=230,        # rotates the start of the pie counter-clockwise from the x-axis
        radius=1,              # explicit default; current Matplotlib no longer accepts None here
        counterclock=True,     # False lays the wedges out clockwise instead
        wedgeprops=None,
        textprops=None,
        center=(0, 0),
        frame=False,
        rotatelabels=False)
plt.title("NULL Data to 'OT'")
plt.show()

# Set the global font size to 10 for the plots that follow.
plt.rc('font', size=10)
In [19]:
# Per-continent summary of spirit servings.
summary_stats = ['mean', 'min', 'max', 'sum']
result = drinks.groupby('continent')['spirit_servings'].agg(summary_stats)
result.head(10)
Out[19]:
In [20]:
# agg() applies several aggregations at once; var = variance, std = standard deviation.
summary_stats = ['count', 'mean', 'min', 'max', 'sum', 'var', 'std']
result = drinks.groupby('continent')['spirit_servings'].agg(summary_stats)
result.head(10)
Out[20]:
In [21]:
# Per-continent statistics for beer servings (var = variance, std = standard deviation).
summary_stats = ['count', 'mean', 'min', 'max', 'sum', 'var', 'std']
result = drinks.groupby('continent')['beer_servings'].agg(summary_stats)
result.head(10)
Out[21]:
In [22]:
# Per-continent statistics for wine servings (var = variance, std = standard deviation).
summary_stats = ['count', 'mean', 'min', 'max', 'sum', 'var', 'std']
result = drinks.groupby('continent')['wine_servings'].agg(summary_stats)
result.head(10)
Out[22]:
In [23]:
# Per-continent statistics for total litres of pure alcohol
# (var = variance, std = standard deviation).
summary_stats = ['count', 'mean', 'min', 'max', 'sum', 'var', 'std']
result = drinks.groupby('continent')['total_litres_of_pure_alcohol'].agg(summary_stats)
result.head(10)
Out[23]:
total_litres_of_pure_alcohol의 전체평균
In [24]:
# Mean of total_litres_of_pure_alcohol over all countries.
total_mean = drinks['total_litres_of_pure_alcohol'].mean()
total_mean
Out[24]:
In [25]:
# Mean pure-alcohol consumption for each continent.
alcohol_by_continent = drinks.groupby('continent').total_litres_of_pure_alcohol
continent_mean = alcohol_by_continent.mean()
print(continent_mean)
In [26]:
# Keep only the continents whose mean is at or above the overall mean.
at_or_above_mean = continent_mean >= total_mean
continent_over_mean = continent_mean[at_or_above_mean]
print(continent_over_mean)
- 평균 beer_servings가 가장 높은 대륙
In [27]:
# Mean beer servings for each continent.
beer_continent = drinks.groupby('continent')['beer_servings'].mean()
beer_continent
Out[27]:
In [28]:
# Row(s) of beer_continent whose value equals the maximum mean.
highest_mean = beer_continent.max()
beer_continent_max = beer_continent[beer_continent == highest_mean]
beer_continent_max
Out[28]:
In [29]:
# Index label (continent) of the largest mean beer_servings.
beer_continent.idxmax()
Out[29]:
In [30]:
# Mean beer_servings value at that continent.
beer_continent[beer_continent.idxmax()]
Out[30]:
- 평균 beer_servings가 가장 낮은 대륙
In [31]:
# Row(s) of beer_continent whose value equals the minimum mean.
lowest_mean = beer_continent.min()
beer_continent_min = beer_continent[beer_continent == lowest_mean]
beer_continent_min
Out[31]:
In [32]:
# Index label (continent) of the smallest mean beer_servings.
beer_continent.idxmin()
Out[32]:
In [33]:
# Mean beer_servings value at that continent.
beer_continent[beer_continent.idxmin()]
Out[33]:
시각화
- 대륙별 spirit_servings의 평균,최소,최대를 시각화
In [34]:
# Per-continent spirit_servings statistics used by the grouped bar chart.
plot_stats = ['mean', 'min', 'max', 'count']
result = drinks.groupby('continent')['spirit_servings'].agg(plot_stats)
result.head(10)
Out[34]:
In [35]:
# Number of continent groups (one row of `result` per continent).
number_groups = result.shape[0]
number_groups
Out[35]:
In [36]:
# Mean spirit_servings per continent as a plain list, for plotting.
means= result['mean'].tolist()
means
Out[36]:
In [37]:
# Grouped bar chart: mean / min / max / count of spirit_servings per continent.
mins = result['min'].tolist()
maxs = result['max'].tolist()
counts = result['count'].tolist()

index = np.arange(number_groups)  # left-most bar position of each group
bar_width = 0.2                   # four bars per group (matplotlib's default width is 0.8)

plt.figure(figsize=(10, 5))  # figure size in inches
rects_1 = plt.bar(index, means, bar_width, color='r', label='Mean')
rects_2 = plt.bar(index + bar_width, mins, bar_width, color='g', label='Min')
rects_3 = plt.bar(index + bar_width * 2, maxs, bar_width, color='b', label='Max')
rects_4 = plt.bar(index + bar_width * 3, counts, bar_width, color='y', label='Count')

# Bars are centred at index + 0, 1, 2, 3 bar widths, so the middle of each group
# is 1.5 bar widths in; put the continent label there rather than under the
# first bar.
plt.xticks(index + bar_width * 1.5, result.index.tolist())
plt.xlabel('continent')
plt.title('spirit_servings by continent')
plt.legend()
plt.show()
- 대륙별 total_litres_of_pure_alcohol 시각화
In [38]:
# Mean pure-alcohol consumption per continent (input for the bar chart below).
alcohol_by_continent = drinks.groupby('continent').total_litres_of_pure_alcohol
continent_mean = alcohol_by_continent.mean()
continent_mean
Out[38]:
In [39]:
# Continent codes, in the index order of continent_mean, as a list.
continents= continent_mean.index.tolist()
continents
Out[39]:
In [40]:
# Build the label list (continent codes plus a trailing 'MEAN' entry) from
# continent_mean instead of appending in place, so re-running this cell never
# adds a second 'MEAN'.
continents = continent_mean.index.tolist() + ['MEAN']
continents
Out[40]:
In [41]:
x_pos=np.arange(len(continents)) # x-axis position of each bar (one per label)
x_pos
Out[41]:
In [42]:
# Bar heights: each continent's mean followed by the overall mean.
alcohol = continent_mean.tolist() + [total_mean]
alcohol
Out[42]:
In [43]:
# Display the overall mean for reference.
total_mean
Out[43]:
In [44]:
# Mean pure-alcohol consumption per continent, with the overall mean as the last bar.
bar_list = plt.bar(x_pos, alcohol, align='center', alpha=0.5)
bar_list[-1].set_color('r')  # highlight the final ('MEAN') bar in red

# Dashed reference line at the overall mean, from the first bar to the last;
# the endpoints come from x_pos so they follow the number of bars.
plt.plot([x_pos[0], x_pos[-1]], [total_mean, total_mean], "k--")

plt.xticks(x_pos, continents)
plt.ylabel('total_litres_of_pure_alcohol')
plt.title('total_litres_of_pure_alcohol by Continent')
plt.show()
In [45]:
# Same chart as above, with the final ('MEAN') bar coloured green.
bar_list = plt.bar(x_pos, alcohol, align='center', alpha=0.5)
bar_list[-1].set_color('g')

# Dashed reference line at the overall mean, from the first bar to the last;
# the endpoints come from x_pos so they follow the number of bars.
plt.plot([x_pos[0], x_pos[-1]], [total_mean, total_mean], "k--")

plt.xticks(x_pos, continents)
plt.ylabel('total_litres_of_pure_alcohol')
plt.title('total_litres_of_pure_alcohol by Continent')
plt.show()
In [46]:
# Display the per-continent means again before the next chart.
continent_mean
Out[46]:
In [47]:
# Bar chart of per-continent means with a dashed segment at the SA mean.
#plt.grid(b=True, which='both', axis='both')
bar_list = plt.bar(x_pos, alcohol, align='center', alpha=0.5)
bar_list[len(continents) - 1].set_color('g') # colour the last bar green
# Dashed black horizontal segment at continent_mean['SA'], drawn from x=3 to x=5.
# NOTE(review): the 3 and 5 are hard-coded bar positions — confirm which bars
# this segment is meant to span.
plt.plot([3., 5], [continent_mean['SA'], continent_mean['SA']], "k--") # draw the dashed line
plt.xticks(x_pos, continents)
plt.ylabel('total_litres_of_pure_alcohol')
plt.title('total_litres_of_pure_alcohol by Continent')
plt.show()
In [48]:
!pip install scipy
In [49]:
# Rows whose continent is 'AF'.
is_africa = drinks['continent'] == 'AF'
africa = drinks.loc[is_africa]
africa.head()
Out[49]:
In [50]:
# Rows whose continent is 'EU'.
is_europe = drinks['continent'] == 'EU'
europe = drinks.loc[is_europe]
europe.head()
Out[50]:
In [51]:
# Preview the frame before adding derived columns.
drinks.head()
Out[51]:
In [52]:
# Derived column: sum of beer, wine and spirit servings per country.
drinks['total_servings']=drinks['beer_servings']+ drinks['wine_servings']+ drinks['spirit_servings']
In [53]:
# Derived column: litres of pure alcohol per serving.
drinks['alcohol_rate'] = drinks['total_litres_of_pure_alcohol'] / drinks['total_servings']
# A country with zero total servings yields NaN (0/0) or +/-inf (x/0). fillna
# alone leaves the infinities in place, and they would then sort to the top of
# the ranking, so map them to NaN first and set all undefined rates to 0.
drinks['alcohol_rate'] = (
    drinks['alcohol_rate']
    .replace([np.inf, -np.inf], np.nan)
    .fillna(0)
)
In [54]:
# Preview the frame with the new total_servings and alcohol_rate columns.
drinks.head()
Out[54]:
In [55]:
# Countries ordered from highest to lowest alcohol_rate.
country_rank = (
    drinks[['country', 'alcohol_rate']]
    .sort_values(by='alcohol_rate', ascending=False)
)
country_rank.head()
Out[55]:
In [56]:
# Bar chart of every country's alcohol_rate in ranked order, highlighting South Korea.
country_list = country_rank.country.tolist()
x_pos = np.arange(len(country_list))
rank = country_rank.alcohol_rate.tolist()

# 0-based position of South Korea in the ranked list; looked up once and reused
# for both the bar colour and the annotation.
korea_rank = country_list.index("South Korea")
korea_alc_rate = country_rank[country_rank['country'] == 'South Korea']['alcohol_rate'].values[0]

bar_list = plt.bar(x_pos, rank)
bar_list[korea_rank].set_color('r')
plt.ylabel('alcohol rate')
plt.title('liquor drink rank by country')  # fixed the "contry" typo in the title
plt.axis([0, 200, 0, 0.3])  # fixed axis limits: x 0-200, y 0-0.3

# Label the South Korea bar with its 1-based rank, offset so the arrow is visible.
plt.annotate('South Korea : ' + str(korea_rank + 1),
             xy=(korea_rank, korea_alc_rate),
             xytext=(korea_rank + 10, korea_alc_rate + 0.05),
             arrowprops=dict(facecolor='red', shrink=0.05))
plt.show()
In [57]:
# Overall mean of total_litres_of_pure_alcohol.
total_mean = drinks['total_litres_of_pure_alcohol'].mean()
total_mean
Out[57]:
In [58]:
# Total spirit_servings per continent, as a one-column ('sum') DataFrame.
spirit_by_continent = drinks.groupby('continent')['spirit_servings']
continent_sum = spirit_by_continent.agg(['sum'])
continent_sum
Out[58]:
In [59]:
# Attach each continent's mean pure-alcohol consumption as a 'mean' column.
alcohol_by_continent = drinks.groupby('continent').total_litres_of_pure_alcohol
continent_sum['mean'] = alcohol_by_continent.mean()
continent_sum
Out[59]:
In [60]:
# Continents whose mean consumption is below the overall mean. The mask uses
# the frame's own 'mean' column, so the cell does not depend on a
# continent_mean Series left over from a much earlier cell.
continent_sum[continent_sum['mean'] < total_mean]
Out[60]:
In [61]:
# Among continents below the overall mean, the one with the largest spirit
# 'sum'. The mask uses the frame's own 'mean' column rather than an earlier
# cell's continent_mean, and a single .loc call selects rows and column together.
continent_sum.loc[continent_sum['mean'] < total_mean, 'sum'].idxmax()
Out[61]:
1) str.contains()를 이용한 방법
In [62]:
# Boolean mask: True where the continent string matches the regex 'AS|AF|OC'.
a= drinks['continent'].str.contains('AS|AF|OC')
print(a)
In [63]:
# Rows of drinks selected by the boolean mask `a`.
b = drinks[a]
print(b)
In [64]:
# Row(s) of `b` with the largest spirit_servings.
top_spirit = b['spirit_servings'].max()
spirit_continent_max = b[b['spirit_servings'] == top_spirit]
spirit_continent_max
Out[64]:
2) isin()을 이용한 방법
In [65]:
# Rows whose continent is one of 'AS', 'AF', 'OC'.
# Note that `a` is rebound here to a DataFrame.
a = drinks[drinks['continent'].isin(['AS', 'AF', 'OC'])]
a
Out[65]:
In [66]:
# Row(s) of `a` with the largest spirit_servings.
top_spirit = a['spirit_servings'].max()
spirit_continent_max = a[a['spirit_servings'] == top_spirit]
spirit_continent_max
Out[66]:
In [67]:
# Display the drinks DataFrame.
drinks
Out[67]:
In [71]:
# Display the drinks DataFrame again.
drinks
Out[71]:
In [77]:
# Overall mean of total_litres_of_pure_alcohol.
total_mean = drinks['total_litres_of_pure_alcohol'].mean()
total_mean
Out[77]:
In [78]:
# Mean pure-alcohol consumption for each continent.
alcohol_by_continent = drinks.groupby('continent').total_litres_of_pure_alcohol
continent_mean = alcohol_by_continent.mean()
continent_mean
Out[78]:
In [89]:
# Codes of the continents whose mean is at or below the overall mean.
at_or_below_mean = continent_mean <= total_mean
continent_under_mean = continent_mean[at_or_below_mean].index.tolist()
continent_under_mean
Out[89]:
A[A.column_name.isin(B)]
- 데이터프레임 A에서 column_name 피처의 값이 B 안에 포함되어 있는 행만 가져옴
In [90]:
# Countries that belong to one of the at-or-below-mean continents.
in_under_mean = drinks['continent'].isin(continent_under_mean)
df_continent_under_mean = drinks.loc[in_under_mean]
df_continent_under_mean
Out[90]:
In [91]:
# Row of the country with the largest spirit_servings within that subset.
top_spirit_idx = df_continent_under_mean['spirit_servings'].idxmax()
most_spirit_under_mean = df_continent_under_mean.loc[top_spirit_idx]
most_spirit_under_mean
Out[91]:
'Python' 카테고리의 다른 글
Python_우리나라 아기이름 데이터 분석 (1) | 2020.09.15 |
---|---|
Python_데이터분석2 (0) | 2020.09.15 |
Python_example (0) | 2020.09.11 |
python_pandas(판다스): 계층적 색인 지정, 누락된 데이터처리, 결측치채우기, 데이터 변형하기, onehot인코딩 (0) | 2020.09.11 |
Python_pandas 문제 (0) | 2020.09.09 |