인공지능/데이터분석

데이터 분석 연습2 - 서울시 범죄 현황 분석

해피밀세트 2020. 6. 19. 21:35
반응형

 

 

 

목표

  • 강남 3구의 체감안전도가 높다는 기사 검증
  • 실제 안전도가 높은지 확인
  • 서울시 구별 범죄 발생과 검거율
  • 위의 정보로 어떤 결론을 내리고, 어떻게 시각화할 것인지 고민

 

사용 데이터

1. 서울시 관서별 5대 범죄 발생 검거 현황 : 2015년 (.csv)

 


 

1. pandas를 이용하여 데이터 정리하기

 

# 라이브러리 불러오기

import numpy as np
import pandas as pd

 

# Load the source data (CSV): per-station counts for Seoul's five major crimes.
# thousands=',' lets comma-grouped figures be parsed as numbers; the file is read as EUC-KR.

crime_anal_police = pd.read_csv('C:/datascience_train/data/02. crime_in_Seoul.csv', thousands=',', encoding='euc-kr')
crime_anal_police.head()

 

 

 

2. Google Maps를 이용해서 주소와 위도, 경도 정보 얻기 

 

# Google Maps API 키 얻기

https://cloud.google.com/maps-platform/

 

# pip install googlemaps

 

# 라이브러리 불러오기

import googlemaps

 

# Authenticate against Google Maps with the API key (masked here; supply your own).

gmaps_key = '******************************'
gmaps = googlemaps.Client(key=gmaps_key)

 

# Sanity check: geocode '서울중부경찰서' and request Korean-language results.

gmaps.geocode('서울중부경찰서', language='ko')

 

# Turn each '관서명' entry into a full station name of the form '서울OO경찰서'.
# The last character of the entry is dropped before '경찰서' is appended.

station_name = [
    '서울' + str(raw_name[:-1] + '경찰서')
    for raw_name in crime_anal_police['관서명']
]

station_name

 

# Geocode every station name and collect its address, latitude and longitude.

station_address = []
station_lat = []
station_lng = []

for name in station_name:
    tmp = gmaps.geocode(name, language='ko')
    first_hit = tmp[0]  # only the first geocoding result is used

    address = first_hit.get('formatted_address')
    station_address.append(address)

    tmp_loc = first_hit.get('geometry')
    coords = tmp_loc['location']
    station_lat.append(coords['lat'])
    station_lng.append(coords['lng'])

    # Progress trace: station name --> resolved address.
    print(name + '-->' + address)

 

# Inspect the first five addresses.

station_address[0:5]

 

# Inspect the first five latitudes.

station_lat[0:5]

 

# Inspect the first five longitudes.

station_lng[0:5]

 

# Extract the district (gu) from each address and store it in a new '구별' column.

gu_name = []

for name in station_address:
    tmp = name.split()

    # The district is the address token that ends in '구' (e.g. '금천구').
    # The previous test compared the last character with an empty string,
    # which can never match, so the [0] lookup raised IndexError.
    tmp_gu = [gu for gu in tmp if gu.endswith('구')][0]

    gu_name.append(tmp_gu)

crime_anal_police['구별'] = gu_name

crime_anal_police.head()

 

# The Geumcheon station is located in Gwanak-gu, so its district is set by hand.

is_geumcheon = crime_anal_police['관서명'] == '금천서'
crime_anal_police.loc[is_geumcheon, ['구별']] = '금천구'
crime_anal_police[is_geumcheon]

 

# Persist the table, now including the '구별' column, as a UTF-8 CSV.

crime_anal_police.to_csv("C:/datascience_train/data/02. crime_in_Seoul_include_gu_name.csv", sep=',',encoding='utf-8')
crime_anal_police.head()

 

 

 

3. pivot_table을 이용해서 데이터 정리하기

 

# Reload the file saved in the previous step.

crime_anal_raw = pd.read_csv("C:/datascience_train/data/02. crime_in_Seoul_include_gu_name.csv", encoding='utf-8')
crime_anal_raw.head()

# Per-station rows -> per-district totals.
# The file is read again with index_col=0 so the first CSV column becomes the index,
# then all rows sharing a '구별' value are summed.

crime_anal_raw = pd.read_csv(
    "C:/datascience_train/data/02. crime_in_Seoul_include_gu_name.csv",
    encoding='utf-8',
    index_col=0,
)

crime_anal = crime_anal_raw.pivot_table(index='구별', aggfunc=np.sum)

crime_anal.head()

 

# Compute the arrest rate (%) for each crime type, then drop the raw arrest counts.

crime_types = ['강간', '강도', '살인', '절도', '폭력']

# Rate = arrests / occurrences * 100, stored as '<crime>검거율'.
for crime in crime_types:
    crime_anal[crime + '검거율'] = crime_anal[crime + ' 검거'] / crime_anal[crime + ' 발생'] * 100

# The '<crime> 검거' count columns are no longer needed once the rates exist.
for crime in crime_types:
    del crime_anal[crime + ' 검거']

crime_anal.head()

 

# Cap every arrest rate at 100: values above 100 are replaced by 100.

con_list = ['강간검거율','강도검거율','살인검거율','절도검거율','폭력검거율']

crime_anal[con_list] = crime_anal[con_list].clip(upper=100)

crime_anal.head()

 

# Drop the ' 발생' suffix from the occurrence columns so they carry only the crime name.

occurrence_columns = {
    crime + ' 발생': crime
    for crime in ['강간', '강도', '살인', '절도', '폭력']
}
crime_anal.rename(columns=occurrence_columns, inplace=True)
crime_anal.head()

 

 

 

4. 데이터 표현을 위해 다듬기

 

# Min-max scale each occurrence column independently; arrest rates stay unscaled.

from sklearn import preprocessing

col = ['강간','강도','살인','절도','폭력']
col2 = ['강간검거율', '강도검거율', '살인검거율', '절도검거율', '폭력검거율']

# Fit the scaler on the occurrence counts (cast to float) and transform them.
min_max_scaler = preprocessing.MinMaxScaler()
x = crime_anal[col].values
x_scaled = min_max_scaler.fit_transform(x.astype(float))

# Rebuild a frame with the same district index and the scaled occurrence columns.
crime_anal_norm = pd.DataFrame(data=x_scaled, index=crime_anal.index, columns=col)

# Carry the arrest-rate columns over as they are.
crime_anal_norm[col2] = crime_anal[col2]

crime_anal_norm.head()

 

# Bring in the per-district population and CCTV counts produced in an earlier exercise.
# The CSV is indexed by '구별'; its '소계' column is stored here under the name 'CCTV'.

result_CCTV = pd.read_csv("C:/datascience_train/data/01. CCTV_result.csv", encoding='UTF-8', index_col='구별')
crime_anal_norm[['인구수','CCTV']] = result_CCTV[['인구수','소계']]
crime_anal_norm.head()

 

# '범죄' = row-wise sum of the five (scaled) occurrence columns.

col = ['강간', '강도', '살인', '절도', '폭력']
crime_anal_norm['범죄'] = crime_anal_norm[col].sum(axis=1)
crime_anal_norm.head()

# '검거' = row-wise sum of the five arrest-rate columns.

col = ['강간검거율', '강도검거율', '살인검거율', '절도검거율', '폭력검거율']
crime_anal_norm['검거'] = crime_anal_norm[col].sum(axis=1)
crime_anal_norm.head()

 

 

 

5. 범죄 데이터 시각화하기

 

# Work around broken Korean glyphs in matplotlib figures by selecting a platform font.

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import platform
path = "C:/Windows/Fonts/malgun.ttf"
from matplotlib import font_manager, rc

if platform.system() == 'Darwin':

    rc('font', family='AppleGothic')

elif platform.system() == 'Windows':

    # Resolve the font family name from the font file at `path`.
    font_name = font_manager.FontProperties(fname=path).get_name()

    rc('font', family=font_name)

else:

    # No font is configured on other platforms; only a notice is printed.
    print('Unknown system.. Sorry~~~~')

 

# Pair plots with regression fits for robbery, murder and violence.
# `height` is used instead of `size`: seaborn renamed the keyword in 0.9 and
# `size` is deprecated.

sns.pairplot(crime_anal_norm, vars=["강도","살인","폭력"], kind='reg', height=3)
plt.show()

# Population and CCTV count versus murder and robbery.

sns.pairplot(crime_anal_norm, x_vars=['인구수', 'CCTV'],
            y_vars=['살인','강도'], kind='reg', height=3)

plt.show()

# Population and CCTV count versus murder and violence arrest rates.

sns.pairplot(crime_anal_norm, x_vars=['인구수', 'CCTV'],
            y_vars=['살인검거율','폭력검거율'], kind='reg', height=3)

plt.show()

 

# Rescale '검거' so that its largest value becomes 100, then sort by it in descending order.
# NOTE: '검거' is overwritten in place, so re-running this cell rescales already-scaled values.

tmp_max = crime_anal_norm['검거'].max()
crime_anal_norm['검거'] = crime_anal_norm['검거'] / tmp_max * 100
crime_anal_norm_sort = crime_anal_norm.sort_values(by='검거', ascending=False)
crime_anal_norm_sort.head()

 

# Heatmap of the five arrest rates, districts ordered by the '검거' total (highest first).

target_col = ['강간검거율', '강도검거율', '살인검거율', '절도검거율', '폭력검거율']
crime_anal_norm_sort = crime_anal_norm.sort_values(by='검거', ascending=False)

plt.figure(figsize = (10,10))

ax = sns.heatmap(crime_anal_norm_sort[target_col], annot=True, fmt='f', linewidths=.5)

# Widen the y-limits by half a cell at each end.
# NOTE(review): presumably a workaround for the top/bottom rows being cut off
# in some matplotlib versions — confirm it is still needed.
buttom,top = ax.get_ylim()

ax.set_ylim(buttom+0.5, top-0.5)

plt.title('범죄 검거 비율 (정규화된 검거의 합으로 정렬)')

plt.show()

 

# Heatmap of the scaled occurrence columns, districts ordered by '범죄' (highest first).

target_col = ['강간', '강도', '살인', '절도', '폭력', '범죄']
# '범죄' is the sum of five scaled columns; dividing by 5 turns it into their mean.
# NOTE: this overwrites the column, so re-running the cell divides it again.
crime_anal_norm['범죄'] = crime_anal_norm['범죄'] / 5
crime_anal_norm_sort = crime_anal_norm.sort_values(by='범죄', ascending=False)

plt.figure(figsize = (10,10))

ax = sns.heatmap(crime_anal_norm_sort[target_col], annot=True, fmt='f', linewidths=.5)

# Widen the y-limits by half a cell at each end.
# NOTE(review): presumably a workaround for the top/bottom rows being cut off
# in some matplotlib versions — confirm it is still needed.
buttom,top = ax.get_ylim()

ax.set_ylim(buttom+0.5, top-0.5)

plt.title('범죄 비율 (정규화된 발생 건수로 정렬)')

plt.show()

 

# Save the normalised per-district table as a UTF-8 CSV.

crime_anal_norm.to_csv("C:/datascience_train/data/02. crime_in_Seoul_final.csv", sep=',', encoding='utf-8')

 

 

 

6. 서울시 범죄율에 대한 지도 시각화

 

# 한국 지도 json 파일 불러오기

import json
import folium
geo_path = "C:/datascience_train/data/02. skorea_municipalities_geo_simple.json"
# Read the GeoJSON inside a context manager so the file handle is closed
# deterministically instead of being left to the garbage collector.
with open(geo_path, encoding='utf-8') as geo_file:
    geo_str = json.load(geo_file)

 

# Centre the map on Seoul and draw the district boundaries,
# shaded by the scaled murder occurrence values.

map = folium.Map(location=[37.5502, 126.982], zoom_start=11, tiles='Stamen Toner')
# Map.choropleth() is deprecated; folium.Choropleth takes the same arguments
# and is attached to the map with add_to().
folium.Choropleth(geo_data = geo_str, data = crime_anal_norm['살인'],
                  columns = [crime_anal_norm.index, crime_anal_norm['살인']],
                  fill_color = 'PuRd', key_on='feature.id').add_to(map)
map

 

# Same map, shaded by the scaled rape occurrence values.

map = folium.Map(location=[37.5502, 126.982], zoom_start=11, tiles='Stamen Toner')
# Map.choropleth() is deprecated; folium.Choropleth takes the same arguments
# and is attached to the map with add_to().
folium.Choropleth(geo_data = geo_str, data = crime_anal_norm['강간'],
                  columns = [crime_anal_norm.index, crime_anal_norm['강간']],
                  fill_color = 'PuRd', key_on='feature.id').add_to(map)
map

 

# Same map, shaded by the overall '범죄' value.

map = folium.Map(location=[37.5502, 126.982], zoom_start=11, tiles='Stamen Toner')
# Map.choropleth() is deprecated; folium.Choropleth takes the same arguments
# and is attached to the map with add_to().
folium.Choropleth(geo_data = geo_str, data = crime_anal_norm['범죄'],
                  columns = [crime_anal_norm.index, crime_anal_norm['범죄']],
                  fill_color = 'PuRd', key_on='feature.id').add_to(map)
map

 

# Murder value relative to population (scaled murder column / population * 1,000,000).

tmp_criminal = crime_anal_norm['살인'] / crime_anal_norm['인구수'] * 1000000

map = folium.Map(location=[37.5502, 126.982], zoom_start=11, tiles='Stamen Toner')

# Map.choropleth() is deprecated; folium.Choropleth takes the same arguments
# and is attached to the map with add_to(). The key column now comes from
# crime_anal_norm, the frame tmp_criminal was derived from, matching the other maps.
folium.Choropleth(geo_data = geo_str, data = tmp_criminal,
                  columns = [crime_anal_norm.index, tmp_criminal],
                  fill_color = 'PuRd', key_on='feature.id').add_to(map)

map

 

 

 

7. 서울시 경찰서별 검거율과 구별 범죄 발생율을 동시에 시각화하기

 

# Attach station coordinates and build a per-station arrest score.
# The score is the sum of the five arrest-count columns, each divided by its
# column maximum (so it is based on counts, not on arrest rates).

crime_anal_raw['lat'] = station_lat
crime_anal_raw['lng'] = station_lng

col = ['살인 검거', '강도 검거', '강간 검거', '절도 검거', '폭력 검거']

arrest_counts = crime_anal_raw[col]
tmp = arrest_counts / arrest_counts.max()

crime_anal_raw['검거'] = tmp.sum(axis=1)

crime_anal_raw.head()

 

# Drop a marker on the map at every police station.

map = folium.Map(location=[37.5502, 126.982], zoom_start=11)

for n in crime_anal_raw.index:
    station_position = [crime_anal_raw.loc[n, 'lat'], crime_anal_raw.loc[n, 'lng']]
    folium.Marker(station_position).add_to(map)

map

 

# Draw one circle per station; the radius is the station's '검거' score times 10.

map = folium.Map(location=[37.5502, 126.982], zoom_start=11)

for n in crime_anal_raw.index:
    station_position = [crime_anal_raw.loc[n, 'lat'], crime_anal_raw.loc[n, 'lng']]
    circle_radius = crime_anal_raw.loc[n, '검거'] * 10

    folium.CircleMarker(station_position,
                        radius=circle_radius,
                        color='#3186cc', fill_color='#3186cc').add_to(map)

map

 

# Show district crime levels and station arrest scores on one map.

map = folium.Map(location=[37.5502, 126.982], zoom_start=11)

# District shading by '범죄'. Map.choropleth() is deprecated; folium.Choropleth
# takes the same arguments and is attached to the map with add_to().
folium.Choropleth(geo_data = geo_str,
                  data = crime_anal_norm['범죄'],
                  columns = [crime_anal_norm.index, crime_anal_norm['범죄']],
                  fill_color = 'PuRd', key_on='feature.id').add_to(map)

# One circle per station; the radius is the station's '검거' score times 10.
for n in crime_anal_raw.index:

    folium.CircleMarker([crime_anal_raw['lat'][n], crime_anal_raw['lng'][n]],
                 radius=crime_anal_raw['검거'][n]*10,
                 color='#3186cc', fill_color='#3186cc').add_to(map)

map

반응형