pandas apply関数の練習
pandasの操作が結構複雑なので練習してみた
■会員毎の月間利用日数を集計
import pandas as pd import numpy as np import seaborn as sns import matplotlib.pyplot as plt from datetime import datetime, date, timedelta from dateutil.relativedelta import relativedelta df = pd.read_csv('./input/nyukan_data_csv.csv') # 計算対象のカラムをdatatime型に変換 df['check'] = pd.to_datetime(df['check']) df['join_date'] = pd.to_datetime(df['join_date']) # 元データをコピー df_tmp = df.copy() #df_tmp # 会員の入会日に合わせての1ヶ月おきの日付をカラムで持たせる(今回は10ヶ月分) for cn in range(0, 11): column_name = 'month_add+{}'.format(cn) df_tmp[column_name] = df_tmp['join_date'].apply(lambda x: x + relativedelta(months=cn)) #df_tmp # 結果格納用の変数(適当に初期化) df_res = [[]] # 会員毎の月間利用日数を集計 for cn in range(0, 10): month_start = 'month_add+{}'.format(cn) month_end = 'month_add+{}'.format(cn+1) df_tmp_grouped = df_tmp.groupby("id").apply(lambda d: ((d['check'] >= d[month_start]) & (d['check'] < d[month_end])).sum()).reset_index() df_tmp_grouped = df_tmp_grouped.rename(columns={0:month_end}) # print(cn) #df_res #df_tmp_grouped # 列を結合 if (cn == 0): df_res = df_tmp_grouped.copy() else: df_res = pd.merge(df_res, df_tmp_grouped, on='id') # 結果 df_res #df_grouped = df.groupby("id").apply(lambda d: (1 if (d.check >= d.join_date) & (d.check <= d.join_date + relativedelta(month=1)) else 0)) #df_grouped = df.groupby("id").filter(lambda d: (d['check'] >= d['join_date']) & (d['check'] <= d['join_date'] + relativedelta(month=1))) #df_grouped = df.groupby("id").query('{} == "X"'.format(s)) #print(df_grouped.groups) #df_grouped.get_group(1) #df_grouped.describe().stack() #df_grouped = df_grouped.apply(date_func, aaxis=1) #df_grouped
■会員毎の月間利用時間の平均を出力(編集中…
# コピー df_tmp = df.copy() # 利用時間を計算 df_tmp['check'] = df_tmp['check'].apply(lambda x: x.strftime('%Y-%m-%d')) df_tmp['start_time'] = df_tmp['start_time'].apply(lambda x: str(x)) df_tmp['end_time'] = df_tmp['end_time'].apply(lambda x: str(x)) df_tmp['start_time'] = pd.to_datetime(df_tmp['check'].str.cat(df_tmp['start_time'], sep=' '), format='%Y-%m-%d %H%M') df_tmp['end_time'] = pd.to_datetime(df_tmp['check'].str.cat(df_tmp['end_time'], sep=' '), format='%Y-%m-%d %H%M') df_tmp['use_time'] = df_tmp['end_time'] - df_tmp['start_time'] # df_tmp # datatime型に戻す df_tmp['check'] = pd.to_datetime(df_tmp['check']) #df_tmp # 会員の入会日に合わせての1ヶ月おきの日付をカラムで持たせる(今回は10ヶ月分) for cn in range(0, 11): column_name = 'month_add+{}'.format(cn) df_tmp[column_name] = df_tmp['join_date'].apply(lambda x: x + relativedelta(months=cn)) df_tmp # # 結果格納用の変数(適当に初期化) # df_res = [[]] # # 会員毎の月間利用日数を集計 # for cn in range(0, 10): # month_start = 'month_add+{}'.format(cn) # month_end = 'month_add+{}'.format(cn+1) # df_tmp_grouped = df_tmp.groupby("id").apply(lambda d: ((d['check'] >= d[month_start]) & (d['check'] < d[month_end])).loc['use_time'].sum()).reset_index() # df_tmp_grouped = df_tmp_grouped.rename(columns={0:month_end}) # # print(cn) # #df_res # #df_tmp_grouped # # 列を結合 # if (cn == 0): # df_res = df_tmp_grouped.copy() # else: # df_res = pd.merge(df_res, df_tmp_grouped, on='id') # # 結果 # df_res