Python_09102
In [1]:
import pandas as pd 
In [2]:
df= pd.read_csv('data/titanic_train.csv', sep=',')
In [3]:
df.head()
Out[3]:
pclass survived name sex age sibsp parch ticket fare cabin embarked body home.dest
0 2 1 Mellinger, Miss. Madeleine Violet female 13.0 0 1 250644 19.5000 NaN S NaN England / Bennington, VT
1 2 1 Wells, Miss. Joan female 4.0 1 1 29103 23.0000 NaN S NaN Cornwall / Akron, OH
2 2 1 Duran y More, Miss. Florentina female 30.0 1 0 SC/PARIS 2148 13.8583 NaN C NaN Barcelona, Spain / Havana, Cuba
3 3 0 Scanlan, Mr. James male NaN 0 0 36209 7.7250 NaN Q NaN NaN
4 3 1 Bradley, Miss. Bridget Delia female 22.0 0 0 334914 7.7250 NaN Q NaN Kingwilliamstown, Co Cork, Ireland Glens Falls...
In [4]:
df1=pd.read_csv('data/ex1.csv')
In [5]:
df1.head()
Out[5]:
a b c d message
0 1 2 3 4 hello
1 5 6 7 8 world
2 9 10 11 12 foo
In [6]:
f = open("data/ex1.csv", 'w')
data = """a,b,c,d,message
1,2,3,4,hello
5,6,7,8,world
9,10,11,12,foo"""
f.write(data)
f.close()
In [7]:
f = open("data/ex2.csv", 'w')
data = """
1,2,3,4,hello
5,6,7,8,world
9,10,11,12,foo
"""
f.write(data)
f.close()

df2= pd.read_csv('data/ex2.csv', header=None)
df2
Out[7]:
0 1 2 3 4
0 1 2 3 4 hello
1 5 6 7 8 world
2 9 10 11 12 foo
In [8]:
df2= pd.read_csv('data/ex2.csv', names=['a','b','c','d','message'])
df2
Out[8]:
a b c d message
0 1 2 3 4 hello
1 5 6 7 8 world
2 9 10 11 12 foo
In [9]:
names=['a','b','c','d','message']
df2= pd.read_csv('data/ex2.csv', names=names, index_col='message') # message가 index의 column으로 감
df2
Out[9]:
a b c d
message
hello 1 2 3 4
world 5 6 7 8
foo 9 10 11 12
In [10]:
f = open("data/csv_mindex.csv", 'w')
data = """key1,key2,value1,value2
one,a,1,2
one,b,3,4
one,c,5,6
one,d,7,8
two,a,9,10
two,b,11,12
two,c,13,14
two,d,15,16
"""
f.write(data)
f.close()
  • 계층적 색인 지정
In [11]:
parsed= pd.read_csv('data/csv_mindex.csv', index_col=['key1', 'key2'])
parsed
Out[11]:
value1 value2
key1 key2
one a 1 2
b 3 4
c 5 6
d 7 8
two a 9 10
b 11 12
c 13 14
d 15 16
In [12]:
f = open("data/ex3.txt", 'w')
data = """A         B         C
aaa -0.264438 -1.026059 -0.619500
bbb  0.927272  0.302904 -0.032399
ccc -0.264273 -0.386314 -0.217601
ddd -0.871859 -0.348382  1.100491
"""
f.write(data)
f.close()
In [13]:
result= pd.read_table('data/ex3.txt', sep='\s+') #\s+ :공백이 하나 이상 발생한 경우 구분
result
Out[13]:
A B C
aaa -0.264438 -1.026059 -0.619500
bbb 0.927272 0.302904 -0.032399
ccc -0.264273 -0.386314 -0.217601
ddd -0.871859 -0.348382 1.100491
In [14]:
f = open("data/ex4.csv", 'w')
data = """#Hey!
a,b,c,d,message
#by python
#csv file
1,2,3,4,hello
5,6,7,8,world
9,10,11,12,foo"""
f.write(data)
f.close()
In [15]:
f = open("data/ex4.csv",'r')
skiplist = []
count=0
while True:
    line = f.readline()
    if not line:
        break
    if '#' in line :
        print(line)
        skiplist.append(count)
    count += 1
f.close()
skiplist
#Hey!

#by python

#csv file

Out[15]:
[0, 2, 3]
In [16]:
f=open('data/ex4.csv', 'r', encoding='utf-8')
lineNum=0
skiplist=[]

while True:
    lines=f.readline()
    if not lines:
        break
    if '#' in lines:
        skiplist.append(lineNum)
    lineNum +=1 
f.close()
skiplist
Out[16]:
[0, 2, 3]
In [17]:
df1= pd.read_csv('data/ex4.csv', skiprows=[0,2,3]) # 불필요한 줄 지움 
df1 
Out[17]:
a b c d message
0 1 2 3 4 hello
1 5 6 7 8 world
2 9 10 11 12 foo
In [18]:
df1= pd.read_csv('data/ex4.csv', skiprows= skiplist) # 불필요한 줄 지움 
df2
Out[18]:
a b c d
message
hello 1 2 3 4
world 5 6 7 8
foo 9 10 11 12
In [19]:
f = open("data/ex5.csv", 'w')
data = """something,a,b,c,d,message
one,1,2,3,4,NA
two,5,6,,8,world
three,9,10,11,12,foo
"""
f.write(data)
f.close()
In [20]:
result= pd.read_csv('data/ex5.csv') 
result
Out[20]:
something a b c d message
0 one 1 2 3.0 4 NaN
1 two 5 6 NaN 8 world
2 three 9 10 11.0 12 foo
In [21]:
pd.isnull(result)
Out[21]:
something a b c d message
0 False False False False False True
1 False False False True False False
2 False False False False False False
In [22]:
result=pd.read_csv('data/ex5.csv', na_values=['NULL']) # na_values: 특정한 값을 NaN으로 취급하고 싶을 때 사용, 1 넣으면 1이 NaN으로 바뀜 
result
Out[22]:
something a b c d message
0 one 1 2 3.0 4 NaN
1 two 5 6 NaN 8 world
2 three 9 10 11.0 12 foo
In [23]:
sentinels={'message': ['foo','world'], 'something': ['two']}
result=pd.read_csv('data/ex5.csv', na_values=sentinels)
result
Out[23]:
something a b c d message
0 one 1 2 3.0 4 NaN
1 NaN 5 6 NaN 8 NaN
2 three 9 10 11.0 12 NaN

엑셀파일읽기

In [24]:
xlsx = 'data/ex01.xlsx'
frame= pd.read_excel(xlsx, 'Sheet1')
frame
Out[24]:
a b c d message
0 0 1 2 3 hello
1 4 5 6 7 world
2 9 10 11 12 foo

엑셀파일저장

In [25]:
writer= pd.ExcelWriter('data/ex02.xlsx')
frame.to_excel(writer, 'Sheet1')
writer.save()

누락된 데이터 처리

인자 설명
dropna 누락된 데이터가 있는 축(low,column)을 제외시킨다. 어느정도의 누락데이터까지 용인할것인지 지정 할 수 있다.
fillna 누락된 데이터를 대신할 값을 채우거나 'ffill'이나 'bfill'같은 보간 메서드를 적용한다.
isnull 누락되거나 NA인 값을 알려주는 불리언 값이 저장된 같은 형의 객체를 반환
notnull isnull과 반대되는 메서드
In [24]:
import numpy as np 
import pandas as pd 
In [25]:
string_data = pd.Series(['aardvark', 'artichoke', np.nan, 'avocado'])
string_data
Out[25]:
0     aardvark
1    artichoke
2          NaN
3      avocado
dtype: object
In [26]:
string_data.isnull()
Out[26]:
0    False
1    False
2     True
3    False
dtype: bool
In [27]:
string_data[0]= None 
string_data.isnull()
Out[27]:
0     True
1    False
2     True
3    False
dtype: bool
In [28]:
from numpy import nan as NA 
data= pd.Series([1,NA,3.5,NA,7])
data.dropna()
Out[28]:
0    1.0
2    3.5
4    7.0
dtype: float64
In [29]:
data[data.notnull()] #data.dropna()와 같은 결과 
Out[29]:
0    1.0
2    3.5
4    7.0
dtype: float64
In [30]:
data = pd.DataFrame([[1., 6.5, 3.], [1., NA, NA],
                     [NA, NA, NA], [NA, 6.5, 3.]])
data
Out[30]:
0 1 2
0 1.0 6.5 3.0
1 1.0 NaN NaN
2 NaN NaN NaN
3 NaN 6.5 3.0
In [31]:
cleand=data.dropna() #NaN이 하나라도 들어있으면 사라짐 
cleand
Out[31]:
0 1 2
0 1.0 6.5 3.0
In [32]:
cleand= data.dropna(how="all") # how=all : 모두 NaN이면 삭제됨
cleand
Out[32]:
0 1 2
0 1.0 6.5 3.0
1 1.0 NaN NaN
3 NaN 6.5 3.0
In [33]:
# 4번 열을 추가하고 NA 값으로 지정 
data[4]=NA
data
Out[33]:
0 1 2 4
0 1.0 6.5 3.0 NaN
1 1.0 NaN NaN NaN
2 NaN NaN NaN NaN
3 NaN 6.5 3.0 NaN
In [34]:
cleand= data.dropna(axis='columns', how="all")
#cleand= data.dropna(axis='1', how="all") 위의 코드와 동일 
cleand
Out[34]:
0 1 2
0 1.0 6.5 3.0
1 1.0 NaN NaN
2 NaN NaN NaN
3 NaN 6.5 3.0
In [35]:
df= pd.DataFrame(np.random.randn(7,3)) #세로가 7개 가로가 3개인
df
Out[35]:
0 1 2
0 -0.131701 0.165730 -1.445157
1 -0.916618 1.702881 0.635776
2 0.591533 0.112397 -0.523709
3 1.330796 -0.356226 -0.715347
4 0.808900 -0.128411 0.492950
5 -0.303647 -1.336609 -1.564481
6 0.328623 -0.499420 0.384282
In [36]:
df.iloc[0:3,1]=NA
df.iloc[:1,2]=NA
df
Out[36]:
0 1 2
0 -0.131701 NaN NaN
1 -0.916618 NaN 0.635776
2 0.591533 NaN -0.523709
3 1.330796 -0.356226 -0.715347
4 0.808900 -0.128411 0.492950
5 -0.303647 -1.336609 -1.564481
6 0.328623 -0.499420 0.384282
In [37]:
cleaned= df.dropna(thresh=2) # 2개이상 NaN이 있으면 삭제됨  
cleaned 
Out[37]:
0 1 2
1 -0.916618 NaN 0.635776
2 0.591533 NaN -0.523709
3 1.330796 -0.356226 -0.715347
4 0.808900 -0.128411 0.492950
5 -0.303647 -1.336609 -1.564481
6 0.328623 -0.499420 0.384282

결측치 채우기

In [38]:
df
Out[38]:
0 1 2
0 -0.131701 NaN NaN
1 -0.916618 NaN 0.635776
2 0.591533 NaN -0.523709
3 1.330796 -0.356226 -0.715347
4 0.808900 -0.128411 0.492950
5 -0.303647 -1.336609 -1.564481
6 0.328623 -0.499420 0.384282
In [39]:
filled=df.fillna(0) # NaN을 0으로 대체 
filled
Out[39]:
0 1 2
0 -0.131701 0.000000 0.000000
1 -0.916618 0.000000 0.635776
2 0.591533 0.000000 -0.523709
3 1.330796 -0.356226 -0.715347
4 0.808900 -0.128411 0.492950
5 -0.303647 -1.336609 -1.564481
6 0.328623 -0.499420 0.384282
In [40]:
filled2=df.fillna({1:0.9, 2:0})
filled2
Out[40]:
0 1 2
0 -0.131701 0.900000 0.000000
1 -0.916618 0.900000 0.635776
2 0.591533 0.900000 -0.523709
3 1.330796 -0.356226 -0.715347
4 0.808900 -0.128411 0.492950
5 -0.303647 -1.336609 -1.564481
6 0.328623 -0.499420 0.384282
In [41]:
df.fillna(0, inplace=False) #원본데이터 안바뀜
df
Out[41]:
0 1 2
0 -0.131701 NaN NaN
1 -0.916618 NaN 0.635776
2 0.591533 NaN -0.523709
3 1.330796 -0.356226 -0.715347
4 0.808900 -0.128411 0.492950
5 -0.303647 -1.336609 -1.564481
6 0.328623 -0.499420 0.384282
In [42]:
df.fillna(0, inplace=True) #원본데이터 0으로 바뀜 
df
Out[42]:
0 1 2
0 -0.131701 0.000000 0.000000
1 -0.916618 0.000000 0.635776
2 0.591533 0.000000 -0.523709
3 1.330796 -0.356226 -0.715347
4 0.808900 -0.128411 0.492950
5 -0.303647 -1.336609 -1.564481
6 0.328623 -0.499420 0.384282
In [43]:
df= pd.DataFrame(np.random.randn(7,3))
df.iloc[2:,1]=NA
df.iloc[4:,2]=NA
df
Out[43]:
0 1 2
0 -0.924464 -0.377605 -1.641186
1 -1.683392 -0.488751 -0.973592
2 1.298459 NaN -0.766517
3 0.340175 NaN -0.209967
4 -0.890470 NaN NaN
5 0.125698 NaN NaN
6 1.980833 NaN NaN
In [44]:
filled= df.fillna(method='ffill')
filled
Out[44]:
0 1 2
0 -0.924464 -0.377605 -1.641186
1 -1.683392 -0.488751 -0.973592
2 1.298459 -0.488751 -0.766517
3 0.340175 -0.488751 -0.209967
4 -0.890470 -0.488751 -0.209967
5 0.125698 -0.488751 -0.209967
6 1.980833 -0.488751 -0.209967
In [45]:
filled= df.fillna(method='ffill', limit=2) #2개만 채움
filled
Out[45]:
0 1 2
0 -0.924464 -0.377605 -1.641186
1 -1.683392 -0.488751 -0.973592
2 1.298459 -0.488751 -0.766517
3 0.340175 -0.488751 -0.209967
4 -0.890470 NaN -0.209967
5 0.125698 NaN -0.209967
6 1.980833 NaN NaN

def.fillna(value=, method='ffill', axis=0, inplace=, limit=)

  • value : 비어있는 값을 채울 스칼라 값이나 dictonary 형식의 객체
  • method : 보간법(기본=ffill)
  • axis : 값을 채워넣을 축(기본 axis=0)
  • inplace : 복사본을 생성하지 않고 호출한 객체에 값을 반환(기본값=False )
  • limit : 값을 앞 또는 뒤로 몇개까지 채울지 지정
In [46]:
data = pd.DataFrame({'k1': ['one', 'two'] * 3 + ['two'],
                     'k2': [1, 1, 2, 3, 3, 4, 4]})
data
Out[46]:
k1 k2
0 one 1
1 two 1
2 one 2
3 two 3
4 one 3
5 two 4
6 two 4
In [47]:
data.duplicated() # 중복이 되는 것 True
Out[47]:
0    False
1    False
2    False
3    False
4    False
5    False
6     True
dtype: bool
In [48]:
data.drop_duplicates() # 중복된 것 사라짐 
Out[48]:
k1 k2
0 one 1
1 two 1
2 one 2
3 two 3
4 one 3
5 two 4
In [49]:
data['v1']=range(7)
data
Out[49]:
k1 k2 v1
0 one 1 0
1 two 1 1
2 one 2 2
3 two 3 3
4 one 3 4
5 two 4 5
6 two 4 6
In [50]:
data.drop_duplicates(['k1']) #k1의 중복요소 삭제  
Out[50]:
k1 k2 v1
0 one 1 0
1 two 1 1
In [51]:
data.drop_duplicates(['k2'])#k2의 중복요소 삭제
Out[51]:
k1 k2 v1
0 one 1 0
2 one 2 2
3 two 3 3
5 two 4 5
In [52]:
data.drop_duplicates(['v1']) # 중복이 없어서 똑같이 나옴 
Out[52]:
k1 k2 v1
0 one 1 0
1 two 1 1
2 one 2 2
3 two 3 3
4 one 3 4
5 two 4 5
6 two 4 6
In [53]:
data
Out[53]:
k1 k2 v1
0 one 1 0
1 two 1 1
2 one 2 2
3 two 3 3
4 one 3 4
5 two 4 5
6 two 4 6
In [54]:
data.drop_duplicates(['k1','k2'], keep='last') # keep='last' : 마지막이 살아남음
# k1과 k2의 값이 같은 건 two 4 5/ two 4 6 이 중 마지막이 살아남음  
Out[54]:
k1 k2 v1
0 one 1 0
1 two 1 1
2 one 2 2
3 two 3 3
4 one 3 4
6 two 4 6

데이터 변형하기

In [55]:
data = pd.DataFrame({'food': ['bacon', 'pulled pork', 'bacon',
                              'Pastrami', 'corned beef', 'Bacon',
                              'pastrami', 'honey ham', 'nova lox'],
                     'ounces': [4, 3, 12, 6, 7.5, 8, 3, 5, 6]})
data
Out[55]:
food ounces
0 bacon 4.0
1 pulled pork 3.0
2 bacon 12.0
3 Pastrami 6.0
4 corned beef 7.5
5 Bacon 8.0
6 pastrami 3.0
7 honey ham 5.0
8 nova lox 6.0
In [56]:
meat_to_animal = {
  'bacon': 'pig',
  'pulled pork': 'pig',
  'pastrami': 'cow',
  'corned beef': 'cow',
  'honey ham': 'pig',
  'nova lox': 'salmon'
}
meat_to_animal
Out[56]:
{'bacon': 'pig',
 'pulled pork': 'pig',
 'pastrami': 'cow',
 'corned beef': 'cow',
 'honey ham': 'pig',
 'nova lox': 'salmon'}
In [57]:
#소문자로 바꾸기 
lower_cased=data['food'].str.lower()
lower_cased
Out[57]:
0          bacon
1    pulled pork
2          bacon
3       pastrami
4    corned beef
5          bacon
6       pastrami
7      honey ham
8       nova lox
Name: food, dtype: object
In [58]:
data['animal']= lower_cased.map(meat_to_animal)
data
Out[58]:
food ounces animal
0 bacon 4.0 pig
1 pulled pork 3.0 pig
2 bacon 12.0 pig
3 Pastrami 6.0 cow
4 corned beef 7.5 cow
5 Bacon 8.0 pig
6 pastrami 3.0 cow
7 honey ham 5.0 pig
8 nova lox 6.0 salmon
In [60]:
#lower_cased=data['food'].str.lower()
#data['animal']= lower_cased.map(meat_to_animal)
data['animal']= data['food'].map(lambda x: meat_to_animal[x.lower()])
data
Out[60]:
food ounces animal
0 bacon 4.0 pig
1 pulled pork 3.0 pig
2 bacon 12.0 pig
3 Pastrami 6.0 cow
4 corned beef 7.5 cow
5 Bacon 8.0 pig
6 pastrami 3.0 cow
7 honey ham 5.0 pig
8 nova lox 6.0 salmon
In [62]:
data=pd.Series([1., -999., 2., -999., -1000., 3.])
data
Out[62]:
0       1.0
1    -999.0
2       2.0
3    -999.0
4   -1000.0
5       3.0
dtype: float64
In [63]:
data2 = data.replace(-999, np.nan)
data2
Out[63]:
0       1.0
1       NaN
2       2.0
3       NaN
4   -1000.0
5       3.0
dtype: float64
In [64]:
data2 = data.replace([-999, -1000], np.nan)
data2
Out[64]:
0    1.0
1    NaN
2    2.0
3    NaN
4    NaN
5    3.0
dtype: float64
In [65]:
data2 = data.replace([-999, -1000], [np.nan, 0])
data2
Out[65]:
0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    3.0
dtype: float64
In [66]:
data2= data.replace({-999.:np.nan, -1000.:0})
#-999 ->Nan , -1000 ->0
data2
Out[66]:
0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    3.0
dtype: float64
In [79]:
data = pd.DataFrame(np.arange(12).reshape((3, 4)),
                    index=['Ohio', 'Colorado', 'New York'],
                    columns=['one', 'two', 'three', 'four'])
data
Out[79]:
one two three four
Ohio 0 1 2 3
Colorado 4 5 6 7
New York 8 9 10 11
In [80]:
transform= lambda x:x[:4].upper() #X를 4자리로 끊음 
data.index.map(transform)
Out[80]:
Index(['OHIO', 'COLO', 'NEW '], dtype='object')
In [81]:
data.index
Out[81]:
Index(['Ohio', 'Colorado', 'New York'], dtype='object')
In [82]:
data.index= data.index.map(transform)
data
Out[82]:
one two three four
OHIO 0 1 2 3
COLO 4 5 6 7
NEW 8 9 10 11
  • str.title : 단어의 시작 부분에 있는 문자는 대문자로, 나머지는 모두 소문자로 만든다.
In [83]:
data.rename(index=str.title, columns=str.upper)
Out[83]:
ONE TWO THREE FOUR
Ohio 0 1 2 3
Colo 4 5 6 7
New 8 9 10 11
In [84]:
data
Out[84]:
one two three four
OHIO 0 1 2 3
COLO 4 5 6 7
NEW 8 9 10 11
In [85]:
data.rename(index={'OHIO':'INDIANA'}, columns={'three':'peekaboo'})
Out[85]:
one two peekaboo four
INDIANA 0 1 2 3
COLO 4 5 6 7
NEW 8 9 10 11
In [86]:
data
Out[86]:
one two three four
OHIO 0 1 2 3
COLO 4 5 6 7
NEW 8 9 10 11
In [90]:
data.rename(index={'OHIO':'INDIANA'}, inplace=True) # 기존 객체를 수정 
data
Out[90]:
one two three four
INDIANA 0 1 2 3
COLO 4 5 6 7
NEW 8 9 10 11
In [91]:
data
Out[91]:
one two three four
INDIANA 0 1 2 3
COLO 4 5 6 7
NEW 8 9 10 11
  • 나이 나누기
    18~25(0)
    26~35(1)
    35~60(2)
    60이상(3)
In [94]:
ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32] 
In [95]:
bins=[18,25,35,60,100] # 각 구간을 나눠줄 숫자값 
cats= pd.cut(ages,bins) #pd.cut(카테고리화 할 숫자데이터, 자를 구간의 구분값)
cats
Out[95]:
[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
Length: 12
Categories (4, interval[int64]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]
In [96]:
cats.codes #각 성분이 몇번쨰 구간에 속해있는지 정수 index로 표시됨 
Out[96]:
array([0, 0, 0, 1, 0, 0, 2, 1, 3, 2, 2, 1], dtype=int8)
In [172]:
cats.categories # ( ] : 왼쪽 미포함, 오른쪽 포함
Out[172]:
IntervalIndex([(18, 25], (25, 35], (35, 60], (60, 100]],
              closed='right',
              dtype='interval[int64]')
In [97]:
pd.value_counts(cats) # 각 구간의 성분의 개수 
Out[97]:
(18, 25]     5
(35, 60]     3
(25, 35]     3
(60, 100]    1
dtype: int64
In [98]:
pd.cut(ages,[18,25,35,60,100], right=False) # [ ) : 왼쪽 포함, 오른쪽 미포함
Out[98]:
[[18, 25), [18, 25), [25, 35), [25, 35), [18, 25), ..., [25, 35), [60, 100), [35, 60), [35, 60), [25, 35)]
Length: 12
Categories (4, interval[int64]): [[18, 25) < [25, 35) < [35, 60) < [60, 100)]
In [99]:
pd.cut(ages,[18,25,35,60,100], right=True) # 위와 반대 
Out[99]:
[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
Length: 12
Categories (4, interval[int64]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]
In [100]:
group_names= ['Youth', 'youngAdult', 'MiddleAged', 'Senior']
In [101]:
data2= pd.cut(ages,bins,labels=group_names)
data2
Out[101]:
[Youth, Youth, Youth, youngAdult, Youth, ..., youngAdult, Senior, MiddleAged, MiddleAged, youngAdult]
Length: 12
Categories (4, object): [Youth < youngAdult < MiddleAged < Senior]
In [102]:
data2.value_counts()
Out[102]:
Youth         5
youngAdult    3
MiddleAged    3
Senior        1
dtype: int64

각 구간 구분값을 bin으로 정의해서 나누었는데 pandas에서 알아서 판단하여 데이터의 길이를 잘라주고 구간을 설정할 수도 있음.

In [105]:
data= np.random.rand(20)
data
Out[105]:
array([0.7166629 , 0.27019459, 0.74586198, 0.46917089, 0.45317755,
       0.10626844, 0.08305884, 0.57318399, 0.5936407 , 0.78475135,
       0.33795658, 0.99624303, 0.2733613 , 0.89804862, 0.10861183,
       0.43367864, 0.09130907, 0.8281011 , 0.54757824, 0.0145025 ])

데이터 성분값을 기준으로 자동으로 구간을 나누게 하기 위해서 나눌 구간의 개수만 입력해준다 (성분의 최소값~최대값을 보고 4개구간 나눔)

In [106]:
pd.cut(data, 4, precision=2) #precision=2 소수점 2자리까지 표현 
Out[106]:
[(0.51, 0.75], (0.26, 0.51], (0.51, 0.75], (0.26, 0.51], (0.26, 0.51], ..., (0.26, 0.51], (0.014, 0.26], (0.75, 1.0], (0.51, 0.75], (0.014, 0.26]]
Length: 20
Categories (4, interval[float64]): [(0.014, 0.26] < (0.26, 0.51] < (0.51, 0.75] < (0.75, 1.0]]
In [107]:
data= pd.DataFrame(np.random.randn(1000,4)) #n붙이면 정규분포.. 
data
Out[107]:
0 1 2 3
0 0.651133 1.584606 2.304293 -0.585606
1 3.000024 -0.510052 -0.256742 1.269666
2 -1.460685 0.493399 -0.285212 0.711587
3 -0.900536 0.580821 1.830050 -0.017385
4 -0.526212 -1.039466 0.856561 -0.704050
... ... ... ... ...
995 1.224748 -0.389117 1.182251 -1.544449
996 -0.665040 -0.029167 0.611918 -0.382876
997 -0.134677 -1.843704 2.086748 0.926602
998 -0.528677 0.842454 0.240342 0.484023
999 -0.501713 0.430430 0.889384 -0.380317

1000 rows × 4 columns

In [108]:
data.describe()
Out[108]:
0 1 2 3
count 1000.000000 1000.000000 1000.000000 1000.000000
mean -0.015997 -0.023723 -0.011136 0.019692
std 0.968457 1.046275 0.982243 1.004339
min -3.738030 -4.591808 -3.460535 -3.228758
25% -0.700473 -0.730756 -0.621459 -0.667099
50% -0.006314 -0.056220 0.014056 -0.003606
75% 0.650398 0.670439 0.640740 0.713520
max 3.096388 2.836078 2.779539 2.658785
In [109]:
data[2] #column 2 
Out[109]:
0      2.304293
1     -0.256742
2     -0.285212
3      1.830050
4      0.856561
         ...   
995    1.182251
996    0.611918
997    2.086748
998    0.240342
999    0.889384
Name: 2, Length: 1000, dtype: float64
In [111]:
col= data[2]
col[np.abs(col)>3]
Out[111]:
402   -3.170485
519   -3.460535
Name: 2, dtype: float64
In [112]:
data[(np.abs(data)>3).any(1)] #data가 절대값이 3보다 큰게 하나라도 있으면 무조건 출력
Out[112]:
0 1 2 3
1 3.000024 -0.510052 -0.256742 1.269666
70 -3.738030 0.695613 -0.065118 -0.201237
279 3.007725 -0.548910 1.084639 0.643492
401 3.096388 -4.591808 -0.394104 -0.362914
402 1.227380 1.542498 -3.170485 1.745196
519 -1.471528 1.880856 -3.460535 -0.681035
984 -0.485900 0.986504 0.394333 -3.228758
  • np.sign(x) : x<0일 때 -1, x==0일 떄 0 ,x>0일 때 1을 반환
In [120]:
data[np.abs(data)>3]= np.sign(data)*3 # 절대값이 3보다 큰 값들을 -1 또는 1로 나타내는데 *3을 해서 3 또는 -3으로 나타나짐
data.describe()
Out[120]:
0 1 2 3
count 1000.000000 1000.000000 1000.000000 1000.000000
mean -0.015363 -0.022131 -0.010505 0.019920
std 0.965566 1.040513 0.980196 1.003624
min -3.000000 -3.000000 -3.000000 -3.000000
25% -0.700473 -0.730756 -0.621459 -0.667099
50% -0.006314 -0.056220 0.014056 -0.003606
75% 0.650398 0.670439 0.640740 0.713520
max 3.000000 2.836078 2.779539 2.658785
In [122]:
data.head()
Out[122]:
0 1 2 3
0 0.651133 1.584606 2.304293 -0.585606
1 3.000000 -0.510052 -0.256742 1.269666
2 -1.460685 0.493399 -0.285212 0.711587
3 -0.900536 0.580821 1.830050 -0.017385
4 -0.526212 -1.039466 0.856561 -0.704050
In [121]:
np.sign(data).head() # 부호를 나타냄 
Out[121]:
0 1 2 3
0 1.0 1.0 1.0 -1.0
1 1.0 -1.0 -1.0 1.0
2 -1.0 1.0 -1.0 1.0
3 -1.0 1.0 1.0 -1.0
4 -1.0 -1.0 1.0 -1.0
In [136]:
df = pd.DataFrame(np.arange(5 * 4).reshape((5, 4)))
df
Out[136]:
0 1 2 3
0 0 1 2 3
1 4 5 6 7
2 8 9 10 11
3 12 13 14 15
4 16 17 18 19
In [137]:
sampler = np.random.permutation(5) # 무작위로 섞인 배열 만든다
sampler
Out[137]:
array([4, 1, 3, 2, 0])
In [140]:
df.take(sampler) #sampler 의 값을 인덱스로
Out[140]:
0 1 2 3
4 16 17 18 19
1 4 5 6 7
3 12 13 14 15
2 8 9 10 11
0 0 1 2 3
In [147]:
df.sample(n=3) # 3개의 인덱스 랜덤하게 추출
Out[147]:
0 1 2 3
3 12 13 14 15
0 0 1 2 3
4 16 17 18 19
In [151]:
choices = pd.Series([5, 7, -1, 6, 4])
choices
Out[151]:
0    5
1    7
2   -1
3    6
4    4
dtype: int64
In [152]:
draws = choices.sample(n=10, replace=True)
draws
Out[152]:
4    4
4    4
0    5
1    7
1    7
0    5
1    7
1    7
3    6
1    7
dtype: int64

onehot 인코딩

  • 문자를 숫자로 바꾸어 주는 방법 중 하나로 onehot인코딩이 있다.
  • 가변수로 만들어주는 것인데, 이는 0과 1로 이루어진 열을 나타낸다.
  • 1은 있다 0은 없다를 나타낸다.
In [189]:
df = pd.DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'b'],
                   'data1': range(6)})
df
Out[189]:
key data1
0 b 0
1 b 1
2 a 2
3 c 3
4 a 4
5 b 5
In [190]:
pd.get_dummies(df['key']) #df['key]열만 get_dummies해줌 
Out[190]:
a b c
0 0 1 0
1 0 1 0
2 1 0 0
3 0 0 1
4 1 0 0
5 0 1 0
In [191]:
dummies = pd.get_dummies(df['key'])
dummies
Out[191]:
a b c
0 0 1 0
1 0 1 0
2 1 0 0
3 0 0 1
4 1 0 0
5 0 1 0
In [208]:
dummies = pd.get_dummies(df['key'],prefix='key') # prefix를 활용하여 좀 더 명시적으로 표현 
#기존 df의 컬럼을 반영해주기 위해서 작성
dummies
Out[208]:
key_a key_b key_c
0 0 1 0
1 0 1 0
2 1 0 0
3 0 0 1
4 1 0 0
5 0 1 0
In [193]:
df_with_dummy = df[['data1']].join(dummies)
df_with_dummy
Out[193]:
data1 key_a key_b key_c
0 0 0 1 0
1 1 0 1 0
2 2 1 0 0
3 3 0 0 1
4 4 1 0 0
5 5 0 1 0
In [210]:
mnames = ['movie_id', 'title', 'genres']
movies = pd.read_table('movies.dat', sep='::', header=None, names=mnames) 
movies[:10]
Out[210]:
movie_id title genres
0 1 Toy Story (1995) Animation|Children's|Comedy
1 2 Jumanji (1995) Adventure|Children's|Fantasy
2 3 Grumpier Old Men (1995) Comedy|Romance
3 4 Waiting to Exhale (1995) Comedy|Drama
4 5 Father of the Bride Part II (1995) Comedy
5 6 Heat (1995) Action|Crime|Thriller
6 7 Sabrina (1995) Comedy|Romance
7 8 Tom and Huck (1995) Adventure|Children's
8 9 Sudden Death (1995) Action
9 10 GoldenEye (1995) Action|Adventure|Thriller
In [211]:
all_genres=[]
for x in movies.genres:
    all_genres.extend(x.split('|'))
genres=pd.unique(all_genres)
genres
# genres의 장르를 중복없이 나오게 하기
Out[211]:
array(['Animation', "Children's", 'Comedy', 'Adventure', 'Fantasy',
       'Romance', 'Drama', 'Action', 'Crime', 'Thriller', 'Horror',
       'Sci-Fi', 'Documentary', 'War', 'Musical', 'Mystery', 'Film-Noir',
       'Western'], dtype=object)
In [212]:
zero_matrix=np.zeros((len(movies), len(genres)))
dummies= pd.DataFrame(zero_matrix, columns=genres)
dummies
Out[212]:
Animation Children's Comedy Adventure Fantasy Romance Drama Action Crime Thriller Horror Sci-Fi Documentary War Musical Mystery Film-Noir Western
0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
1 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
2 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
3 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
4 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
3878 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
3879 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
3880 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
3881 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
3882 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0

3883 rows × 18 columns

In [213]:
gen= movies.genres[0]
gen.split('|')
dummies.columns.get_indexer(gen.split('|'))
Out[213]:
array([0, 1, 2], dtype=int64)
In [214]:
dummies
Out[214]:
Animation Children's Comedy Adventure Fantasy Romance Drama Action Crime Thriller Horror Sci-Fi Documentary War Musical Mystery Film-Noir Western
0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
1 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
2 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
3 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
4 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
3878 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
3879 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
3880 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
3881 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
3882 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0

3883 rows × 18 columns

  • enumerate
    : 보통 for문과 함께 사용
    : 인덱스 값을 포함하는 enumerate객체를 리턴
    : for문처럼 반복되는 구간에서 객체가 현재 어느 위치에 있는지 알려주는 인덱스 값이 필요할 때 사용하면 유용
In [215]:
for i, gen in enumerate(movies.genres):
    indices = dummies.columns.get_indexer(gen.split('|')) #gen 첫번째 줄에 나와있는 string 
    dummies.iloc[i, indices] = 1 

dummies
Out[215]:
Animation Children's Comedy Adventure Fantasy Romance Drama Action Crime Thriller Horror Sci-Fi Documentary War Musical Mystery Film-Noir Western
0 1.0 1.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
1 0.0 1.0 0.0 1.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
2 0.0 0.0 1.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
3 0.0 0.0 1.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
4 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
3878 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
3879 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
3880 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
3881 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
3882 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0

3883 rows × 18 columns

In [216]:
pd.set_option('display.max_columns', 25)
#pd.set_option('display.max_row', 10)
movies_windic= movies.join(dummies.add_prefix('Genre_'))
movies_windic.head()
Out[216]:
movie_id title genres Genre_Animation Genre_Children's Genre_Comedy Genre_Adventure Genre_Fantasy Genre_Romance Genre_Drama Genre_Action Genre_Crime Genre_Thriller Genre_Horror Genre_Sci-Fi Genre_Documentary Genre_War Genre_Musical Genre_Mystery Genre_Film-Noir Genre_Western
0 1 Toy Story (1995) Animation|Children's|Comedy 1.0 1.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
1 2 Jumanji (1995) Adventure|Children's|Fantasy 0.0 1.0 0.0 1.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
2 3 Grumpier Old Men (1995) Comedy|Romance 0.0 0.0 1.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
3 4 Waiting to Exhale (1995) Comedy|Drama 0.0 0.0 1.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
4 5 Father of the Bride Part II (1995) Comedy 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
In [231]:
np.random.seed(12345)
values = np.random.rand(10)
values
Out[231]:
array([0.92961609, 0.31637555, 0.18391881, 0.20456028, 0.56772503,
       0.5955447 , 0.96451452, 0.6531771 , 0.74890664, 0.65356987])
In [232]:
bins=[0, 0.2, 0.4, 0.6, 0.8, 1]
pd.get_dummies(pd.cut(values, bins))
Out[232]:
(0.0, 0.2] (0.2, 0.4] (0.4, 0.6] (0.6, 0.8] (0.8, 1.0]
0 0 0 0 0 1
1 0 1 0 0 0
2 1 0 0 0 0
3 0 1 0 0 0
4 0 0 1 0 0
5 0 0 1 0 0
6 0 0 0 0 1
7 0 0 0 1 0
8 0 0 0 1 0
9 0 0 0 1 0
In [243]:
val= ' a,b, guido '
val.split(',') #공백포함됨
Out[243]:
[' a', 'b', ' guido ']
In [244]:
val
Out[244]:
' a,b, guido '
In [245]:
val= 'a,b, guido'
val.strip() #양끝 공백 제거
Out[245]:
'a,b, guido'
In [246]:
pieces = [x.strip() for x in val.split(',')]
pieces
Out[246]:
['a', 'b', 'guido']
In [221]:
first, second, third= pieces 
first+ '::'+ second+ '::'+ third
Out[221]:
'a::b::guido'
In [247]:
first
Out[247]:
'a'
In [249]:
pieces
Out[249]:
['a', 'b', 'guido']
In [222]:
'::'.join(pieces)
Out[222]:
'a::b::guido'
In [223]:
'guido' in val
Out[223]:
True
In [256]:
val.index(',')
Out[256]:
1
In [257]:
val.find(':')
Out[257]:
-1
In [248]:
val.find(',')
Out[248]:
1
In [259]:
val.index(':')
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-259-2c016e7367ac> in <module>
----> 1 val.index(':')

ValueError: substring not found
In [261]:
val.count(',')
Out[261]:
2
In [262]:
val.replace(',', '::')
Out[262]:
'a::b:: guido'
In [263]:
val.replace(',', '')
Out[263]:
'ab guido'

'Python' 카테고리의 다른 글

Python_판다스_데이터분석  (0) 2020.09.14
Python_example  (0) 2020.09.11
Python_pandas 문제  (0) 2020.09.09
Python_pandas(판다스):시리즈,데이터프레임,색인,인덱싱,sorting  (0) 2020.09.09
Python 기초09_vectorize  (0) 2020.09.08

+ Recent posts