ものともしれない日々

参考にできるかかもしれない備忘録

pandas apply関数の練習

pandasの操作が結構複雑なので練習してみた

■会員毎の月間利用日数を集計

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from datetime import datetime, date, timedelta
from dateutil.relativedelta import relativedelta

df = pd.read_csv('./input/nyukan_data_csv.csv')

# 計算対象のカラムをdatatime型に変換
df['check'] = pd.to_datetime(df['check'])
df['join_date'] = pd.to_datetime(df['join_date'])

# 元データをコピー
df_tmp = df.copy()
#df_tmp

# 会員の入会日に合わせての1ヶ月おきの日付をカラムで持たせる(今回は10ヶ月分)
for cn in range(0, 11):
    column_name = 'month_add+{}'.format(cn)
    df_tmp[column_name] =  df_tmp['join_date'].apply(lambda x: x + relativedelta(months=cn))
#df_tmp


# 結果格納用の変数(適当に初期化)
df_res = [[]]

# 会員毎の月間利用日数を集計
for cn in range(0, 10):
    month_start = 'month_add+{}'.format(cn)
    month_end = 'month_add+{}'.format(cn+1)
    df_tmp_grouped = df_tmp.groupby("id").apply(lambda d: ((d['check'] >= d[month_start]) & (d['check'] < d[month_end])).sum()).reset_index()
    df_tmp_grouped = df_tmp_grouped.rename(columns={0:month_end})
    # print(cn)
    #df_res
    #df_tmp_grouped
    # 列を結合
    if (cn == 0):
        df_res = df_tmp_grouped.copy()
    else:
        df_res = pd.merge(df_res, df_tmp_grouped, on='id')
# 結果
df_res
    
#df_grouped = df.groupby("id").apply(lambda d: (1 if (d.check >= d.join_date) & (d.check <= d.join_date + relativedelta(month=1)) else 0))
#df_grouped = df.groupby("id").filter(lambda d: (d['check'] >= d['join_date']) & (d['check'] <= d['join_date'] + relativedelta(month=1)))
#df_grouped = df.groupby("id").query('{} == "X"'.format(s))
#print(df_grouped.groups)
#df_grouped.get_group(1)
#df_grouped.describe().stack()
#df_grouped = df_grouped.apply(date_func, aaxis=1)
#df_grouped

■会員毎の月間利用時間の平均を出力(編集中…

 # コピー
df_tmp = df.copy()
# 利用時間を計算
df_tmp['check'] = df_tmp['check'].apply(lambda x: x.strftime('%Y-%m-%d'))
df_tmp['start_time'] = df_tmp['start_time'].apply(lambda x: str(x))
df_tmp['end_time'] = df_tmp['end_time'].apply(lambda x: str(x))
df_tmp['start_time'] = pd.to_datetime(df_tmp['check'].str.cat(df_tmp['start_time'], sep=' '), format='%Y-%m-%d %H%M')
df_tmp['end_time'] = pd.to_datetime(df_tmp['check'].str.cat(df_tmp['end_time'], sep=' '), format='%Y-%m-%d %H%M')
df_tmp['use_time'] = df_tmp['end_time'] - df_tmp['start_time']
# df_tmp

# datatime型に戻す
df_tmp['check'] = pd.to_datetime(df_tmp['check'])
#df_tmp

# 会員の入会日に合わせての1ヶ月おきの日付をカラムで持たせる(今回は10ヶ月分)
for cn in range(0, 11):
    column_name = 'month_add+{}'.format(cn)
    df_tmp[column_name] =  df_tmp['join_date'].apply(lambda x: x + relativedelta(months=cn))
df_tmp


# # 結果格納用の変数(適当に初期化)
# df_res = [[]]

# # 会員毎の月間利用日数を集計
# for cn in range(0, 10):
#     month_start = 'month_add+{}'.format(cn)
#     month_end = 'month_add+{}'.format(cn+1)
#     df_tmp_grouped = df_tmp.groupby("id").apply(lambda d: ((d['check'] >= d[month_start]) & (d['check'] < d[month_end])).loc['use_time'].sum()).reset_index()
#     df_tmp_grouped = df_tmp_grouped.rename(columns={0:month_end})
#     # print(cn)
#     #df_res
#     #df_tmp_grouped
#     # 列を結合
#     if (cn == 0):
#         df_res = df_tmp_grouped.copy()
#     else:
#         df_res = pd.merge(df_res, df_tmp_grouped, on='id')
# # 結果
# df_res