실전 예제 - 온-오프라인 비지니스 분석 10

빅데이터/Data-Analysis 2022. 3. 2. 17:27

패스트캠퍼스 '직장인을 위한 파이썬 데이터분석 올인원 패키치 Online' 참조

01 상품 주문 고객들 확인 (olist-customer_dataset)

df = pd.read_csv('olist_customers_dataset.csv')
df.head()

# 데이터 확인
df.info()
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   customer_id               99441 non-null  object
 1   customer_unique_id        99441 non-null  object
 2   customer_zip_code_prefix  99441 non-null  int64 
 3   customer_city             99441 non-null  object
 4   customer_state            99441 non-null  object
 
 # 결측치
df.isnull().sum()
customer_id                 0
customer_unique_id          0
customer_zip_code_prefix    0
customer_city               0
customer_state              0

# state별 고객 수
cnt_state = pd.DataFrame(df['customer_state']).value_counts().reset_index()
cnt_state

# 컬럼 열 변경
cnt_state.columns = ['states', 'people_lives']
cnt_state

# 고객 수 시각화 : 막대그래프
plt.figure(figsize=(15,6))
sns.barplot(x='states', y='people_lives', data=cnt_state);

▶ 거주 비율

# 도시 별 고객 비율 (퍼센트)
# nomarlize - 비율 값을 줌 
df['customer_city'].value_counts(normalize=True)*100
sao paulo            15.627357
rio de janeiro        6.920687
belo horizonte        2.788588
brasilia              2.142979
curitiba              1.529550
                       ...    
                       
# 도시별 고객이 살고 있는 비율 상위 10개
df_customer_city = pd.DataFrame(df['customer_city'].value_counts(normalize=True)*100).reset_index()
df_customer_city.columns=['city','people_lives_perc']
df_customer_city.loc[:10,:]

labels = df_customer_city['city'].values[:10]
sizes = df_customer_city['people_lives_perc'].values[:10]

explode = (0.1, 0.1, 0, 0, 0,
           0, 0, 0, 0, 0)  

fig1, ax1 = plt.subplots()
ax1.pie(sizes, labels=labels, 
        explode=explode, 
        autopct='%1.1f%%',
        shadow=True, startangle=30,
       textprops={'fontsize': 13})

ax1.axis('equal')  

plt.tight_layout()
plt.title('Ratio of Customer in each city', fontsize=20)
plt.show()

02 실제 주문 상품 데이터 확인 (olist_order_items_dataset)

df = pd.read_csv('olist_order_items_dataset.csv')
df.head()

# 데이터 확인
df.info()
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   order_id             112650 non-null  object 
 1   order_item_id        112650 non-null  int64  
 2   product_id           112650 non-null  object 
 3   seller_id            112650 non-null  object 
 4   shipping_limit_date  112650 non-null  object 
 5   price                112650 non-null  float64
 6   freight_value        112650 non-null  float64
 
 # 결측치 확인
df.isnull().sum()
order_id               0
order_item_id          0
product_id             0
seller_id              0
shipping_limit_date    0
price                  0
freight_value          0

이 데이터에서 흥미로운 열이 'order_item_id'이다. 웹 개발을 해봤던 사람이라면 왜 이런 번호가 붙는지 이미 알 고 있을 것이다. UUID 개념으로 쇼핑몰 웹사이트를 만들 때 고객ID가 있을것이고 그 고객ID가 주문하는 주문 ID가 있을것이고 그 주문ID가 가지고 있는 주문물품에 대한 ID를 말한다.
이것을 조금 더 분석 해보면 아래와 같다

# 'order_item_id' 이해 하기
# 1. order_item_id를 가장 많이 갖고 있는 order_id를 출력
# order_id 별로 그룹 화 이후 그 그룹화 된 데이터에 order_item_id의 개수를 각각 샘
temp = pd.DataFrame(df.groupby(by=['order_id'])['order_item_id'].count().reset_index())
temp.columns = ['order_id', 'order_item 수']
temp[temp['order_item 수'] == temp['order_item 수'].max()]

# 위의 데이터를 확인
df[df['order_id'] == '8272b63d03f5f79c56e9e4120aec44ef']

즉 위에서 설명 했듯이 'order_item_id'는 각각 주문되어지는 상품마다 고유 id가 붙어 식별이 되는것

# 하나의 주문 번호에서 상품별 매출액 산출

df_qt = pd.DataFrame(df.groupby(by=['order_id', 'product_id'])['order_item_id'].count().reset_index())

# 컬럼 추가
df_qt.columns = ['order_id', 'product_id', 'order_prod_quantity']
df_qt

df_qt[df_qt['order_id']=='8272b63d03f5f79c56e9e4120aec44ef']

# 새로운 DF 병합 후 확인
df_order_item_col = ['order_id', 'product_id', 'seller_id',
                    'shipping_limit_date', 'price', 'freight_value']

# merge
# df, df_qt로 병합하는데 키는 on='', inner조인이기 때문에 df,df_qt의 데이터를 둘다 가지고 있는 것들만 출력 됨
df = pd.merge(df[df_order_item_col], df_qt, how='inner', on=['order_id', 'product_id'])

df = df[['order_id','product_id','price','freight_value',
               'order_prod_quantity','shipping_limit_date','seller_id']]
df

# 병합시 데이터 중복 제거
df.drop_duplicates(inplace=True)
df.reset_index(drop=True, inplace=True)
df

처음에는 각 order_id 당 몇개의 주문을 했는지 몰랐는데 이렇게 전처리 이후 한번에 원하는 데이터를 함꼐 볼 수 있다.

▶ 실제 주문 상품 데이터에서 전처리가 되어 오더 ID에 따른 상품 개수까지 볼 수 있도록 처리 했다. 이제는 이 데이터를 기준으로 매출액, 상품 내용등을 확인 해 보자

# 상품 별 매출액 
df['order_amount'] = df['price'] * df['order_prod_quantity']
df.reset_index(drop=True, inplace=True)
df

03 주문 상품 들여다 보기 (olist_order_items_dataset, olist_products_dataset, product_category_name_translation)

실제 주문 내역 테이블들을 기준으로 상품 정보를 매칭 하여야 한다

# 데이터 불러오기
df_product = pd.read_csv('olist_products_dataset.csv')
df_product.head()

# 데이터2 
df_category = pd.read_csv('product_category_name_translation.csv')
df_category

# 데이터 확인
df_product.info()
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   product_id                  32951 non-null  object 
 1   product_category_name       32341 non-null  object 
 2   product_name_lenght         32341 non-null  float64
 3   product_description_lenght  32341 non-null  float64
 4   product_photos_qty          32341 non-null  float64
 5   product_weight_g            32949 non-null  float64
 6   product_length_cm           32949 non-null  float64
 7   product_height_cm           32949 non-null  float64
 8   product_width_cm            32949 non-null  float64
 
 # 결측치
df_product.isnull().sum()
product_id                      0
product_category_name         610
product_name_lenght           610
product_description_lenght    610
product_photos_qty            610
product_weight_g                2
product_length_cm               2
product_height_cm               2
product_width_cm                2

결측치들이 존재 한다.
'product_category_name' 이 포루투갈어 이므로 영어 명칭 테이블과 매칭을 시켜 줘야 한다.

# product_category_name 컬럼 영어 명칭 테이블과 매칭
# 상품정보 테이블 기준으로 결합이므로 left 조인
# 공통된 키(데이터)는 'product_category_name'
df_product_cat = pd.merge(df_product, df_category, how='left',
                          on=['product_category_name'])

# 칼럼 순서 재배치
df_product_cat = df_product_cat[['product_id', 'product_category_name', 'product_category_name_english',
                   'product_name_lenght','product_description_lenght',
                   'product_photos_qty', 'product_weight_g',
                   'product_length_cm', 'product_height_cm', 'product_width_cm']]

df_product_cat

이제 이 데이터를 주문 테이블과 결합 하자

# 주문 테이블과 결합
df = pd.merge(df, df_product_cat, how='left', on=['product_id'])

# 결측치 
df.isnull().sum()
order_id                            0
product_id                          0
price                               0
freight_value                       0
order_prod_quantity                 0
shipping_limit_date                 0
seller_id                           0
order_amount                        0
product_category_name            1460
product_category_name_english    1482
product_name_lenght              1460
product_description_lenght       1460
product_photos_qty               1460
product_weight_g                   16
product_length_cm                  16
product_height_cm                  16
product_width_cm                   16

결합이 많다보니 결측치 값도 점점 늘어 났다. 해결 하자.

# null값 row 출력
df[df.isnull().any(axis=1)]

# 결측치 제거
df_order_item_prod_clean = df.dropna(axis=0)
df_order_item_prod_clean.reset_index(drop=True,inplace=True)
df_order_item_prod_clean

# 결측치 확인
df_order_item_prod_clean.isnull().sum()
order_id                         0
product_id                       0
price                            0
freight_value                    0
order_prod_quantity              0
shipping_limit_date              0
seller_id                        0
order_amount                     0
product_category_name            0
product_category_name_english    0
product_name_lenght              0
product_description_lenght       0
product_photos_qty               0
product_weight_g                 0
product_length_cm                0
product_height_cm                0
product_width_cm                 0

04 최종 데이터를 가지고 분석

# 소수점 출력 설정
pd.options.display.float_format = '{:.2f}'.format

# 상품카테고리 별 주문수 확인
df_order_item_prod_clean['product_category_name_english'].value_counts()

# 상품 카테고리 주문수 탑 10
cat_top10 = df_order_item_prod_clean['product_category_name_english'].value_counts()[:10]
cat_top10.index
Index(['bed_bath_table', 'health_beauty', 'sports_leisure',
       'computers_accessories', 'furniture_decor', 'housewares',
       'watches_gifts', 'telephony', 'auto', 'toys'],
      dtype='object')
      
# 상품 카테고리 주문수 탑10에 포함되는 row만 출력
df_cat_10 = df_order_item_prod_clean[df_order_item_prod_clean['product_category_name_english'].isin(cat_top10.index)].reset_index(drop=True)
df_cat_10

# 상위 10개의 카테고리별 정보 - 상품등록정보
cat_info1_col = ['product_name_lenght','product_description_lenght','product_photos_qty']

df_cat_10.groupby('product_category_name_english')[cat_info1_col].mean()

# 요약 정보 확인
df_cat_10[cat_info1_col].describe()

예로 상품 설명의 길이는 664정도인데 'health_beauty'의 상품 설명은 1111로 다소 길다. 즉 사람들이 건강 뷰티 상품의 설명은 많이 읽거나 자주 본다는 뜻

# 2016~2018년 까지 어떤 상품 카테고리의 매출이 높은지 확인
# 카테고리 종류 수
print("상품 카테고리 종류 수 : {} 종류".format(len(df_order_item_prod_clean['product_category_name_english'].unique())))
상품 카테고리 종류 수 : 71 종류

# 매출액 기준 상품 카테고리
# 그룹바이를 by=''기준으로 짓고, 그 이후 'order_amount'의 합을 같이 출력
temp = pd.DataFrame(df_order_item_prod_clean.groupby(by=['product_category_name_english'])['order_amount'].sum())

temp = temp.sort_values(by='order_amount', ascending=False)
temp

# 이를 시각화
# 상위 10개만 
plt.figure(figsize=(12,10))
squarify.plot(sizes=temp['order_amount'][:10],#사이즈는 상위 10개만 들고 오도록,
             label=temp.index.values[:10], alpha=.7)

시각화로든 데이터로든 'health_beauty'가 가장 높은 주문수를 들고 있다.

▶ 주문수와 매출액의 관계

# 주문수와 매출액의 관계 보기

# 카테고리별 주문수 확인
df_cat_order_cnt = pd.DataFrame(df_order_item_prod_clean['product_category_name_english'].value_counts())
df_cat_order_cnt = df_cat_order_cnt.reset_index()
df_cat_order_cnt.columns = ['category', 'order_cnt']
df_cat_order_cnt

# 카테고리별 매출액 확인
df_cat_amount = pd.DataFrame(df_order_item_prod_clean.groupby(by=['product_category_name_english'])['order_amount'].sum())
df_cat_amount = df_cat_amount.sort_values(by='order_amount', ascending=False)
df_cat_amount = df_cat_amount.reset_index()
df_cat_amount.columns = ['category', 'order_amount']
df_cat_amount

# 두 카데고리 비율화
# 카테고리별 주문 비율

df_cat_order_cnt['order_cnt_perc'] = (df_cat_order_cnt['order_cnt'] / sum(df_cat_order_cnt['order_cnt']))*100
df_cat_order_cnt

df_cat_amount['order_amount_perc'] = (df_cat_amount['order_amount'] / sum(df_cat_amount['order_amount']))*100
df_cat_amount

# 카테고리별 주문수와 매출액 테이블 결합

df_cat = pd.merge(df_cat_order_cnt, df_cat_amount,
        how='inner', on='category')
df_cat

# 매출액 기준으로 sorting

df_cat = df_cat.sort_values(by='order_amount', ascending=False)
df_cat = df_cat.reset_index(drop=True)
df_cat

# melt
# 매출액 기준 상위 10개 카테고리만 출력

df_cat_melt = pd.melt(df_cat[:10], 
                      id_vars=['category'], 
                      value_vars=['order_cnt_perc','order_amount_perc'])
df_cat_melt

# barplot으로 시각화

plt.figure(figsize=(12,10))
ax = sns.barplot(data = df_cat_melt,
                 x="category",
                 y="value",
                 hue="variable",
                 color="salmon"
                )
ax.set_xticklabels(ax.get_xticklabels(),rotation=30)
plt.show()

저작자표시 (새창열림)

'빅데이터 > Data-Analysis' 카테고리의 다른 글

실전 예제 - 시계열 분석 12 (0)	2022.03.03
실전 예제 - 온-오프라인 비지니스 분석 11 (0)	2022.03.02
실전 예제 - 온-오프라인 비지니스 분석 09 (0)	2022.03.01
실전 예제 - 코로나 데이터 분석 08 (0)	2022.02.28
실전 예제 - 유투브 채널 및 영상 분석 07 (0)	2022.02.26

ABOUT ME

Treasure Treasure

'빅데이터 > Data-Analysis' 카테고리의 다른 글

티스토리툴바

ABOUT ME

'빅데이터 > Data-Analysis' 카테고리의 다른 글

관련글 관련글 더보기

티스토리툴바