# DataAnalysis 이론

엘리스

데이터 분석과 인공지능 스터디
데이터 분석과 인공지능
13주차
- 13-01
- 13-02 ✅
- 13-03
- 13-04 ☑️ 포켓몬 데이터 분석 추가하기!!!
- 13-05

# 데이터와 정보

데이터 : 현실 세계의 일을 관찰, 측정해서 얻은 값

정보 : 데이터를 처리해서 얻은 의미있는 값

데이터 분석 : 데이터를 활용하여 원하는 정보를 얻어내기 위한 일련의 과정

# 데이터 분석 프로세스

데이터 분석 프로젝트 : 문제 정의 -> 가설 설정 -> 데이터 준비 -> 데이터 분석 -> 결과 정리

문제 정의 : 풀고자 하는 문제가 무엇인지 명확히 정의

가설 설정 : 문제 해결을 위한 방향 설정, 데이터 분석의 토대로, 문제와의 관련성을 고려해야한다.

데이터 준비 : 문제에 대한 정보를 담고 있는 데이터 셋을 선정, 수집, 전처리(데이터 정제, Data Cleaning)

전처리 과정에서는 빠진 부분, 중복, 이상값 제거, 형태 변환 등 초기 데이터(Raw Data)를 처리한다.

데이터 분석 : 탐색적 데이터 분석(EDA, Exploratory Data Analysis, 데이터의 특징을 찾고 숨겨진 패턴을 발견)를 통해 본격적인 데이터 분석을 실시한다. 얼마나 데이터를 이해하는지가 가장 중요하다.

EDA : 데이터 분석 단계에 해당하며, 데이터 분포 확인, 변수 간의 관계 파악을 통해 데이터의 특징을 발견하고 이해해야한다.

결과 정리 : 평가, 해결, 개선방향, insight(in+sight, 사물의 이면을 통찰하는것, 의미있는 정보)를 정리한다.

의미있는 데이터 분석은 문제해결을 위해 인사이트를 얻고 개선과 해결방안을 얻어내는 것이다.

명확한 목표 설정과 흐름에 따른 데이터 분석이 진행되어야한다.

# Numpy

Numerical Python. Python에서 대규모 다차원 배열을 다룰 수 있게 도와주는 라이브러리다. 대부분의 데이터(음성, 사진 등)는 숫자 배열이다. python list보다 빠른 연산과 효율적인 메모리 관리가 가능하다.

# array

list(range(10)) # python list

import numpy as np
# np.array builds an ndarray from a Python sequence.
np.array([1,2,3,4,5]) # array([1, 2, 3, 4, 5])
np.array([1.2 ,2,3,4,5]) # array([1.2, 2. , 3. , 4. , 5. ]) - one float makes every element float
np.array([1,2,3,4], dtype='float') # array([1., 2., 3., 4.]) - four elements, stored as float
# A nested list produces a two-dimensional array.
np.array([[1, 2],
          [3, 4]])

1
2
3
4
5
6
7
8

python list와는 다르게 단일 타입으로만 구성된다.

arr = np.array([1,2,3,4], dtype=float)

type(arr) # numpy.ndarray
arr.dtype # dtype('float64')
arr.astype(int) # array([1, 2, 3, 4]) - a new array converted to int

1
2
3
4
5

배열 데이터타입

dtype	설명	표현
int	정수	i, int_, int32, int64, i8
float	실수	f, float_, float32, float64, f8
str	문자열	str, U, U32
bool	부울	?, bool_

다양한 배열

np.zeros(5, dtype=int) # array([0, 0, 0, 0, 0])
np.ones((2, 3), dtype=float) # [[1., 1., 1.], [1., 1., 1.]]
np.arange(0, 10, 2) # array([0, 2, 4, 6, 8])
np.linspace(0, 1, 5) # array([0., 0.25, 0.5, 0.75, 1.])

np.random.random((2,2)) # the tuple argument is the shape: a 2x2 matrix
np.random.normal(0,1,(2,2)) # mean, standard deviation, shape (2x2)
np.random.randint(0,10,(2,2)) # integers from 0 to 9 (the upper bound 10 is excluded), shape (2x2)

1
2
3
4
5
6
7
8

문제

import numpy as np
#0부터 5사이 랜덤한 값이 담긴 3x5 array를 만들어 봅시다!
array=np.random.randint(0,5,(3,5))

1
2
3

TIP

numpy로 배열 만들기

배열의 속성은 다양하다.

# Random integers in [0, 10) with shape (2, 4). A single positional argument is
# the exclusive upper bound, so randint(0, size=...) would request the empty
# range [0, 0) and raise ValueError.
x = np.random.randint(10, size=(2, 4)) # e.g. array([[2,3,9,0], [4,2,1,7]])

x.ndim # 2 (a two-dimensional matrix)
x.shape # (2, 4)
x.size # 8
x.dtype # an integer dtype, e.g. dtype('int64')

# Indexing: read or write the element at a position.
x = np.arange(7) # 1-D example array: 0,1,2,3,4,5,6
x[0] = 7 # x is now 7,1,2,3,4,5,6
# Slicing: take part of the array by index range.
x[:4] # 7,1,2,3
x[:4:2] # 7,2
# 2-D slicing: rows 0 to 1 and columns 1 to 3.
matrix = np.arange(12).reshape(3, 4)
matrix[0:2, 1:4] # [[1,2,3],[5,6,7]]

1
2
3
4
5
6
7
8
9
10
11
12
13

# reshape

x=np.arange(8)
x.shape # (8, ) 1차원 배열
x2=x.reshape((2, 4)) # 2차원 배열로 변경 (2, 4)

# concatenate : 이어붙이기
x=np.array([0,1,2])
y=np.array([3,4,5])
np.concatenate([x, y]) # array([0, 1, 2, 3, 4, 5])

matrix=np.arange(4).reshape(2,2) # [[0,1],[2,3]]
np.concatenate([matrix, matrix], axis=0) # 수직으로 붙이기
np.concatenate([matrix, matrix], axis=1) # 가로로 붙이기

# split : 나누기
upper, lower = np.split(matrix, [1], axis=0) # 1번째 인덱스 아래 가로로 자름

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15

# 기본 연산

x=np.arange(4) # 0,1,2,3
x+5 # 5,6,7,8
x-5 # -5,-4,-3,-2
x*5 # 0,5,10,15
x/5 # 0,0.2,0.4,0.6
# 다차원 행렬에서도 사용 가능하다!

1
2
3
4
5
6

# 브로드캐스팅Broadcasting

shape이 다른 array끼리 연산한다. 예를 들면, 3*3행렬인 matrix와 5의 연산이다.

matrix + 5 # [5,5,5,],[5,5,5],[5,5,5]를 더하는 것과 같다!
matrix + np.array([1,2,3]) # [1,2,3],[1,2,3],[1,2,3]를 더하는 것과 같다!

np.arange(3).reshape((3,1)) + np.arange(3)

1
2
3
4

# 집계 함수

x=np.arange(8).reshape(2,4) # the values 0..7 arranged as 2 rows x 4 columns
np.sum(x) # 28, sum of all elements
np.min(x) # 0
np.max(x) # 7
np.mean(x) # 3.5
np.std(x) # standard deviation

np.sum(matrix, axis=0) # 세로 방향 합
np.sum(matrix, axis=1) # 가로 방향 합

1
2
3
4
5
6
7
8
9
10

# 마스킹 연산

True, False array를 통해서 특정 값들을 뽑아내는 방법이다.

x=np.arange(5)
x<3 # # [True,True,True,False,False]
x>5 # [False,False,False,False,False]
x[x<3] # 3 미만인 값만 출력

1
2
3
4

양치기 소년의 거짓말 횟수

import numpy as np

daily_liar_data = [0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0]

# Count how many times the shepherd boy lied and print the result.
# NOTE(review): 1 is read as "lied" and 0 as "did not lie" - confirm against
# the exercise statement. The previous mask (liar_array < 1) selected the
# zeros, i.e. it counted the days without a lie.
liar_array = np.array(daily_liar_data)
lie_count = np.size(liar_array[liar_array == 1])
print(lie_count)

1
2
3
4
5
6
7

# Pandas

구조화된 데이터를 효과적으로 처리하고 저장할 수 있는 파이썬 라이브러리이다. Array 계산에 특화된 numpy를 기반으로 만들어졌다.

# Series

numpy array가 보강된 형태로 Data와 Index를 가지고 있다.

import pandas as pd
data=pd.Series([1,2,3,4])
data2=pd.Series([1,2,3,4], index=['a','b','c','d'])
data2['a'] # 1

1
2
3
4

딕셔너리로 만들 수 있다.

population_dict={
    'korea':5180,
    'china':141500,
    'usa':32676,
}
# Assign with "=" ("==" only compares, and fails because population is undefined).
population = pd.Series(population_dict) # keys become the index, values the data
population.values # the data as a numpy array

1
2
3
4
5
6
7

# DataFrame

여러개의 Series를 모아서 행과 열을 이룬 데이터이다.

gdp=pd.Series(gdp_dict)
country=pd.DataFrame({
    'population':population,
    'gdp':gdp
})

country.index # ['korea','china','usa']
country.columns # ['gdp', 'population']
type(country['gdp']) # pandas.core.series.Series

gdp_per = country['gdp'] / country['population']
country['gdp_per'] = gdp_per # 추가 가능

1
2
3
4
5
6
7
8
9
10
11
12

저장과 불러오기도 가능하다.

country.to_csv('./country.csv') # CSV = comma-separated values
country.to_excel('country.xlsx') # the Excel writer is chosen from the .xlsx extension

country=pd.read_csv('./country.csv')
country=pd.read_excel('country.xlsx')

1
2
3
4
5

# Indexing과 Slicing

country.loc['usa'] # usa 데이터만 출력, Name이 usa
country.loc['japan':'korea', :'population']

1
2

loc는 명시적인 인덱스를 참조하는 인덱싱이자 슬라이싱이다.

과학적 표기법 : 1.40925e+09는 1.40925 × 10^9를 뜻함

iloc는 파이썬 스타일 정수 인덱스의 인덱싱이자 슬라이싱이다. 결과는 loc와 동일하다.

country.iloc[0]
country.iloc[1:3, :2]

1
2

ix라는 혼합형태가 있었지만 지금은 지원이 중단되었다.

DataFrame에 새로운 데이터를 추가하는 방법도 있다.

df=pd.DataFrame(columns=['이름','나이','주소'])
df.loc[0] = ['박수정','26','서울']
df.loc[0, '주소'] = '경산'

df['전화번호'] = np.nan # 비우기
df.loc[0, '전화번호'] ='010-****-0063' # 추가
len(df) # 1

1
2
3
4
5
6
7

컬럼을 선택할 수 있다. 이때 컬럼이 하나이면 Series, 리스트라면 DataFrame 형태로 나타난다.

누락된 데이터를 체크하는 것도 중요하다.

df.isnull() # True where the value is NaN or None
df.notnull() # True where the value is neither NaN nor None

df.dropna() # drop every row that contains a missing value
df['전화번호'] = df['전화번호'].fillna('전화번호 없음') # replace missing values with a placeholder

1
2
3
4
5

# 연산

연산시 비어있는 값(NaN)을 대체해줄 수 있다.

A = pd.DataFrame(np.random.randint(0, 10, (2, 2)), columns=['A', 'B'])      
B = pd.DataFrame(np.random.randint(0, 10, (3, 3)), columns=['B', 'A', 'C'])

A.add(B, fill_value=0) # 0으로 대체
# sub, mul, div 도 가능

1
2
3
4
5

집계함수도 모두 활용 가능하다.

df=pd.DataFrame(data)
df['A'].sum() # a single value: the sum of column A
df.sum() # a Series holding the sum of each column
df.mean() # a Series holding the mean of each column

1
2
3
4

# 정렬

df.sort_values('col1') # 컬럼 값에 따라 오름차순으로 정렬
df.sort_values('col1', ascending=False) # 내림차순
df.sort_values(['col1', 'col2']) # col1 기준 -> 같은 경우 col2 기준

# col2를 기준으로 오름차순으로, col1를 기준으로 내림차순으로 정렬
sorted_df3=df.sort_values(['col2', 'col1'], ascending=[True, False])

1
2
3
4
5
6

# 조건으로 검색

masking 연산이 가능하다.

import numpy as np
import pandas as pd
df = pd.DataFrame(np.random.rand(5, 2), columns=["A", "B"])

# 조건에 맞는 df의 row를 추출
df[(df['A']<0.5) & (df['B']>0.3)] 
df.query("A<0.5 and B>0.3") # 같은 결과

1
2
3
4
5
6
7

만약 문자열이라면 다른 조건 검색도 가능하다.

df['Animal'].str.contains("cat") # 포함하면 True, 포함하지 않으면 False
df.Animal.str.match("cat") # 일치하는지 검사

1
2

# 함수로 데이터 처리

apply

def square(x):
    """Return x raised to the second power."""
    return pow(x, 2)

df['Square'] = df['Num'].apply(square)

# 함수 없이 처리 가능
df['Square'] = df.Num.apply(lambda x:x**2)

1
2
3
4
5
6
7

전화가 가능한 번호로 바꾸기 위해서 apply를 활용할 수 있다.

def get_preprocess_phone(phone):
    """Turn a phone number written with Korean digit words into plain digits.

    Each digit character listed below is swapped for its numeral and the
    "-" and "." separators are removed; every other character is kept.
    """
    # One translation table performs all substitutions in a single pass.
    # This matches replacing key by key because no replacement value is
    # itself one of the keys.
    table = str.maketrans({
        "공": "0",
        "일": "1",
        "이": "2",
        "삼": "3",
        "사": "4",
        "오": "5",
        "육": "6",
        "칠": "7",
        "팔": "8",
        "구": "9",
        "-": "",
        ".": "",
    })
    return phone.translate(table)

df['preprocess_phone']=df['phone'].apply(get_preprocess_phone)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20

replace

apply 기능에서 데이터 값만 대체하고 싶을 때 사용한다. 함수가 필요없다.

# Encode the Sex column as integers. Values that are not keys of the mapping
# are left unchanged, so the keys must match the data exactly.
# NOTE(review): assumes the column holds 'Male' / 'Female' - confirm against the dataset.
df["Sex"]=df.Sex.replace({'Male':0, "Female":1})

df.Sex.replace({'Male':0, "Female":1}, inplace=True) # in-place form of the same replacement

1
2
3

# 그룹화

조건부로 집계를 하고 싶을 때 사용한다.

df.groupby('key').sum() # group by key, then sum each group (groupby is a method: call it with parentheses)
df.groupby(['key', 'data1']).sum() # group by each (key, data1) combination

1
2

aggregate

그룹화하여 집계를 한번에 계산하는 방법이다.

df.groupby('key').aggregate(['min', np.median, max])
df.groupby('key').aggregate({'data1': min, 'data2': np.sum})

1
2

filter

그룹 속성을 기준으로 데이터를 필터링 하는 방법이다.

def filter_by_mean(x):
    """Tell whether the mean of the 'data2' column of x is greater than 3."""
    data2_mean = x['data2'].mean()
    return data2_mean > 3
df.groupby('key').mean() # 평균값
df.groupby('key').filter(filter_by_mean) # True만 가져옴

1
2
3
4

apply

그룹에 묶인 데이터에 함수를 적용한다.

df.groupby('key').apply(lambda x:x.max() - x.min())

get_group

그룹으로 묶인 데이터를 key 값으로 가져올 수 있다.

df.head()
df.groupby('시도').get_group('충남')
len(df.groupby('시도').get_group('충남'))

1
2
3

# MultiIndex & pivot_table

인덱스를 계층적으로 만들 수 있다.

# 행
df=pd.DataFrame(
    np.random.randn(4,2),
    index=[['A','A','B','B'], [1,2,1,2]],
    columns=['data1','data2']
)

1
2
3
4
5
6

# 열
df=pd.DataFrame(
    np.random.randn(4,4),
    columns=[['A','A','B','B'], [1,2,1,2]]
)

# Select the top-level column "A", then its sub-column 1. The second level
# holds the integers 1 and 2, so the string "1" would raise KeyError.
df["A"][1]

1
2
3
4
5
6
7

pivot_table

데이터에서 필요한 자료만 뽑아서 새롭게 요약하고 분석할 수 있는 기능이다. 엑셀에서의 피봇 테이블과 같다.

index는 행 인덱스로 들어갈 key이고, columns는 열 인덱스로 라벨링이 될 값이다. values에는 분석할 데이터가 들어간다.

df.pivot_table(
    index='sex', columns='class', values='survived',
    aggfunc=np.mean # value를 어떤 식으로 채울 것인지
)

1
2
3
4

# Matplotlib

파이썬에서 데이터를 그래프나 차트로 시각화할 수 있는 라이브러리이다.

import matplotlib.pyplot as plt
x=[1,2,3,4,5]
y=[1,2,3,4,5]
plt.plot(x,y)
plt.title("First Plot")
plt.xlabel("x")
plt.ylabel("y")

# Creating fig and ax explicitly: same plot as the pyplot calls.
fig,ax = plt.subplots()
ax.plot(x,y)
ax.set_title("First Plot")
ax.set_xlabel("x")
ax.set_ylabel("y")
fig.set_dpi(300) # resolution in dots per inch (not the figure size)
fig.savefig("fist_plot.png") # write the figure to an image file

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16

line plot이다.

# matplotlib의 구조

figure : 가장 넓은 부분(그래프를 모두 포함하는 도화지)

title : 제목

x, y label : 라벨

axes : 하나의 그래프

Major tick : 큰 눈금

Minor tick : 작은 눈금

legend : 범례

여러개의 그래프를 그릴 수 있다.

x=np.linspace(0, np.pi*4, 100) # 0~4pi 까지 100개의 구간
fig, axes=plt.subplots(2,1) # 2개의 그래프
axes[0].plot(x, np.sin(x))
axes[1].plot(x, np.cos(x))

1
2
3
4

# line plot

fig, ax=plt.subplots()
x=np.arange(15)
y=x**2
ax.plot(
    x,y,
    linestyle=":",
    marker="*",
    color="#524FA1"
)

1
2
3
4
5
6
7
8
9

# Line Style
fig, ax=plt.subplots()
ax.plot(x,x,linestyle='-') # solid
ax.plot(x,x+2,linestyle='--') # dashed
ax.plot(x,x+4,linestyle='-.') # dashdot
ax.plot(x,x+6,linestyle=':') # dotted

1
2
3
4
5
6

# Color
ax.plot(x,x,color='r') # rgbcmyk
ax.plot(x,x+2,color='green')
ax.plot(x,x+4,color='0.8') # 회색조
ax.plot(x,x+6,color='#524FA1')

1
2
3
4
5

# Marker
# Marker shapes are selected with marker=; codes such as 'o' or 's' are not
# valid linestyle values.
fig, ax=plt.subplots()
ax.plot(x,x,marker='.') # point
ax.plot(x,x+2,marker='o') # circle
ax.plot(x,x+4,marker='v') # triangle (pointing down)
ax.plot(x,x+6,marker='s') # square
ax.plot(x,x+8,marker='*') # star

1
2
3
4
5
6
7

# 축 경계 조정
x=np.linspace(0, 10, 1000)
fig, ax=plt.subplots()
ax.plot(x,np.sin(x)) # sin 그래프
ax.set_xlim(-2, 12) # 시작, 끝 값
ax.set_ylim(-1.5, 1.5)

1
2
3
4
5
6

# 범례
fig, ax=plt.subplots()
ax.plot(x,x, label='y=x')
ax.plot(x,x**2, label='y=x**2') # the label now matches the plotted expression
ax.set_xlabel('x') # axis captions
ax.set_ylabel('y')
# The Axes method is spelled legend(); it builds the legend from the label= values above.
ax.legend(
    loc='upper right', # also: lower / center combined with left / right
    shadow=True, # draw a shadow behind the legend
    fancybox=True, # rounded corners
    borderpad=2 # padding inside the legend border
)

1
2
3
4
5
6
7
8
9
10
11
12

# Scatter

fig, ax=plt.subplots()
x=np.arange(10)
ax.plot(
    x, x**2, 'o', # o 속성은 o형태로 이어지지 않는 그래프가 그려짐
    markersize=15, # 원 크기
    markerfacecolor='white', # 안쪽 색깔
    markeredgecolor='blue' # 바깥쪽 색깔
)

1
2
3
4
5
6
7
8

fig, ax=plt.subplots()
x=np.random.randn(50)
y=np.random.randn(50)
colors=np.random.randint(0,100,50)
# Scale the squared random factor so that marker areas vary between 0 and
# 500*pi. Adding the two terms instead would leave every marker at roughly
# the same (very large) size.
# NOTE(review): assumes varied marker sizes were intended - confirm.
sizes=500*np.pi*np.random.rand(50)**2
ax.scatter(
    x, y, c=colors, s=sizes, alpha=0.3 # positions, colors, marker areas, transparency so overlaps stay visible
)

1
2
3
4
5
6
7
8

# Bar & Histogram

Bar plot

x=np.arange(10)
fig, ax=plt.subplots(figsize=(12,4)) # 가로12, 세로4
ax.bar(x, x*2)

1
2
3

# Stacked bar chart: each series is drawn on top of the ones before it.
# NOTE(review): x, y and z must be defined beforehand; presumably each holds
# three values, one per bar position - confirm.
data=[x,y,z]
fig, ax=plt.subplots()
x_ax=np.arange(3) # bar positions
for i in x_ax:
    ax.bar(
        x_ax, data[i],
        bottom=np.sum(data[:i], axis=0), # start each bar at the total of the series already drawn
    )
ax.set_xticks(x_ax)
ax.set_xticklabels(['A', 'B', 'C'])

1
2
3
4
5
6
7
8

Histogram

fig, ax=plt.subplots()
data=np.random.randn(1000)
ax.hist(data, bins=50) # 50개로 나눔

1
2
3

# Pandas plot

pandas를 활용하여 그래프를 그리는 방법이 있다.

df=pd.read_csv('./president_height.csv')
fig, ax=plt.subplots() # unpack the (figure, axes) pair returned by subplots
ax.plot(df['order'], df['height'], label='height')
ax.set_xlabel('order')
ax.set_ylabel('height')

1
2
3
4
5

포켓몬 분석

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

df = pd.read_csv("./data/pokemon.csv")

fire = df[
    (df['Type 1']=='Fire') | ((df['Type 2'])=="Fire")
]

water = df[
    (df['Type 1']=='Water') | ((df['Type 2'])=="Water")
]

# Attack vs. Defense scatter for the Fire and Water selections.
# Lowercase colour codes are used: the one-letter shorthands ('r', 'b', ...)
# are case-sensitive in recent matplotlib releases, which reject 'R' / 'B'.
fig, ax = plt.subplots()
ax.scatter(fire['Attack'], fire['Defense'],
    color='r', label='Fire', marker="*", s=50)
ax.scatter(water['Attack'], water['Defense'],
    color='b', label="Water", s=25)
ax.set_xlabel("Attack")
ax.set_ylabel("Defense")
ax.legend(loc="upper right")

fig.savefig("plot.png")

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24

DataAnalysis 실습 →