Coding/Python

Pandas 라이브러리 모음

폴밴 2022. 2. 21. 14:33

Pandas

import numpy as np
import pandas as pd
labels = ['a','b','c']
my_list = [10,20,30]
arr = np.array([10,20,30])
my_dict = {'a' : 10, 'b' : 20, 'c' : 30}

Series

pd.Series(data = my_list)
pd.Series(data = my_list, index = labels)
pd.Series(arr)
pd.Series(arr,labels)
pd.Series(my_dict)
ser_1 = pd.Series([1,2,3,4], index = ['A','B','C','D'])
ser_1
ser_2 = pd.Series([4,3,2,1],['B','C','D','F'])
ser_2
ser_1 + ser_2  # index가 하나만 있을 경우 NaN

DataFrame

Column Selection

df = pd.DataFrame(np.random.randn(5,4), index = 'A B C D E'.split(), 
	columns = 'W X Y Z'.split())  # use split() to make index or columns
    
df['W']
df[['W','Z']]  # select multiple columns by list
df['New'] = df['W'] + df['Z']
df

Drop Column

df.drop('New', axis = 1)  # axis = 1 : column
df.drop('E',axis = 0)  # axis = 0 : row
df
df.drop('New', axis = 1, inplace = True)  # [inplace = True] : apply to dataframe
df

Row Selection

df.loc['A']
df.iloc[0]
df.loc['B','Y']
df.loc[['A','B'],['X','Y']]

Conditional Selection

df > 0
df[df['X'] > 0]
df[df['X'] > 0][['Y','Z']]
df[(df['X'] > 0) & (df['Y'] < 0)]
df[(df['X'] > 0) | (df['Y'] < 0)]

Reset Index

df.reset_index()  # reset index to default

Set Index

alphabet = 'AB CD EF GH IK'.split()
df['new'] = alphabet
df
df.set_index('new', inplace = True)  # set new index with existing column
df

Multi-index

outside = 'G1 G1 G1 G2 G2 G2'.split()
inside = [1, 2, 3, 1, 2, 3]
hier_index = list(zip(outside, inside))  # making a list with (outside, inside) tuples
hier_index = pd.MultiIndex.from_tuples(hier_index)
hier_index
df = pd.DataFrame(np.random.randn(6,2), index = hier_index, columns = ['A', 'B'])
df
df.loc['G1']
df.loc['G1'].loc[1]
df.index.names = ['Group', 'Index']  # column name of multi-index
df
df.xs('G1')
df.xs(('G1',1))
df.xs(1, level = 'Index')  # select multiple rows with different group index

Missing data

df = pd.DataFrame({'A' : [1, 2, np.nan],
                   'B' : [3, np.nan, np.nan],
                   'C' : [4, 5, 6]})

df.dropna(axis = 1) # drop column which has NaN value
df.dropna(thresh = 2)  # accept less than 2 NaN value
df.fillna(value = 'FILL')  # fill Nan value with value = 'FILL'
df['A'].fillna(value=df['A'].mean(),inplace = True)  # fill Nan value with mean
df

Groupby

# 여러개의 row를 column 기준으로 묶기

df = pd.DataFrame({'Comp' : ['Google', 'Google', 'Microsoft', 'Microsoft', 'Apple', 'Apple'],
      'Name' : ['Charlie', 'Sam', 'Jake', 'Paul', 'Ben', 'Kate'],
      'Sales' : [100, 200, 300, 400, 500, 600]})
df
df.groupby('Comp')
df.groupby('Comp').mean()  # compute mean grouped by Comp
df.groupby('Comp').std()
df.groupby('Comp').min()
df.groupby('Comp').max()
df.groupby('Comp').count()
df.groupby('Comp').describe()  # show meaningful data of groups
df.groupby('Comp').describe()
df.groupby('Comp').describe().transpose()['Apple']

Concatenation

df1 = pd.DataFrame({'A': ['A0', 'A1', 'A2', 'A3'],
'B': ['B0', 'B1', 'B2', 'B3'],
'C': ['C0', 'C1', 'C2', 'C3'],
'D': ['D0', 'D1', 'D2', 'D3']},
index=[0, 1, 2, 3])
df2 = pd.DataFrame({'A': ['A4', 'A5', 'A6', 'A7'],
'B': ['B4', 'B5', 'B6', 'B7'],
'C': ['C4', 'C5', 'C6', 'C7'],
'D': ['D4', 'D5', 'D6', 'D7']},
index=[4, 5, 6, 7])
df3 = pd.DataFrame({'A': ['A8', 'A9', 'A10', 'A11'],
'B': ['B8', 'B9', 'B10', 'B11'],
'C': ['C8', 'C9', 'C10', 'C11'],
'D': ['D8', 'D9', 'D10', 'D11']},
index=[8, 9, 10, 11])

pd.concat([df1, df2, df3]) # row로 붙이기
pd.concat([df1, df2, df3], axis = 1) # column으로 붙이기

Merging

left = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3'],
'A': ['A0', 'A1', 'A2', 'A3'],
'B': ['B0', 'B1', 'B2', 'B3']})
right = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3'],
'C': ['C0', 'C1', 'C2', 'C3'],
'D': ['D0', 'D1', 'D2', 'D3']})

pd.merge(left, right, how = 'inner', on = 'key')

Joining

left = pd.DataFrame({'A': ['A0', 'A1', 'A2'],
 'B': ['B0', 'B1', 'B2']},
  index=['K0', 'K1', 'K2'])
right = pd.DataFrame({'C': ['C0', 'C2', 'C3'],
'D': ['D0', 'D2', 'D3']},
 index=['K0', 'K2', 'K3'])
 
left.join(right)
left.join(right, how = 'outer')

Unique Value, Conditional Selection

df = pd.DataFrame({'col1':[1,2,3,4],'col2':[444,555,666,444],'col3':['abc','def','ghi','xyz']})

df['col2'].unique()  # unique values
df['col2'].nunique()  # number of unique values
df['col2'].value_counts()  # unique value occurrence
df[(df['col1'] > 2) & (df['col2'] == 444)]

Applying Functions

def times2(x):
    return x * 2

df['col1'].apply(times2)
df['col3'].apply(len)
df['col1'].sum()
del df['col1']  # permanently delete column
df.columns
df.index
df.sort_values(by='col2')
df.isnull()

Data Input / Output

df = pd.read_csv('file name')  # input
pd.read_excel('file name.xlsx',sheetname='Sheet1')
df.to_csv('file name', index = False)  # output
df.to_excel('file name.xlsx',sheet_name='Sheet1'