import pandas as pd
import os
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
def validateDir(path: str):
assert os.path.exists(path) and os.path.isdir(path), '경로에 디렉토리가 존재하지 않습니다.'
def convert_to_numeric(df):
numeric_feature = df.columns[(df.dtypes==int) | (df.dtypes== float)]
categorical_feature = df.columns[df.dtypes=='O']
print("수치형 컬럼 : ", list(numeric_feature))
print("카테고리형 컬럼 : ", list(categorical_feature))
for col in categorical_feature:
if isinstance(df[col][0], int) or (isinstance(df[col][0],str) and df[col][0].isnumeric()):
df[col] = pd.to_numeric(df[col])
print("숫자값을 수치형 컬럼으로 변환합니다.")
print(df.dtypes)
def check_missing_col(dataframe):
missing_col = []
counted_missing_col = 0
for i, col in enumerate(dataframe.columns):
missing_values = sum(dataframe[col].isna())
is_missing = True if missing_values >= 1 else False
if is_missing:
counted_missing_col += 1
print(f'결측치가 있는 컬럼: {col}')
print(f'해당 컬럼의 결측치 수: {missing_values}')
missing_col.append([col, dataframe[col].dtype])
if counted_missing_col == 0:
print('결측치가 존재하지 않습니다')
return missing_col
def show_mean_duration_for(df, min_vus = 0, max_vus = 10000000, min_conflict_rate=0, max_conflict_rate=10000000, groupby_columns = []):
print(f'\n(min_vus : {min_vus}, max_vus : {max_vus}, min_conflict_rate : {min_conflict_rate:.2f}, max_conflict_rate : {max_conflict_rate:.2f}) 조건에서의 통계 분석')
df = df.loc[(df.vus >= min_vus)&(df.vus <= max_vus)&(df.conflict_rate >= min_conflict_rate)&(df.conflict_rate <= max_conflict_rate),:]
print('\nduration 순으로 상위 5개')
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
print(df.sort_values(['duration']).head(5))
if groupby_columns:
print('\n',groupby_columns, '별 duration 평균값')
print(df.loc[:, ['duration']+groupby_columns].groupby(groupby_columns).mean())
print('\n',groupby_columns, '별 duration 최소값')
print(df.loc[:, ['duration']+groupby_columns].groupby(groupby_columns).min())
else:
print('\n평균 duration')
print(df.mean())
print('\n최소 duration')
print(df.min())
def corr_between_feature_target(df, feature, target):
plt.figure(figsize=(15,10))
heat_table = df.corr(method="kendall")
mask = np.zeros_like(heat_table)
mask[np.triu_indices_from(mask)] = True
heatmap_ax = sns.heatmap(heat_table, annot=True, mask=mask, cmap='coolwarm')
heatmap_ax.set_xticklabels(heatmap_ax.get_xticklabels(), fontsize=15, rotation=45)
heatmap_ax.set_yticklabels(heatmap_ax.get_yticklabels(), fontsize=15)
plt.title('correlation between features', fontsize=20)
plt.show()
'''
numeric_feature = ['conflict_rate']
plt.figure(figsize=(20,15))
plt.suptitle("Pearson Scatter Plot", fontsize=15)
for i in range(len(numeric_feature)):
plt.subplot(2,3,i+1)
plt.xlabel(numeric_feature[i])
plt.ylabel(target)
corr_score = df[[numeric_feature[i], target]].corr().iloc[0,1].round(2)
c = 'red' if corr_score > 0 else 'blue'
plt.scatter(df[numeric_feature[i]], df[target], color=c, label=f"corr = {corr_score}")
plt.legend(fontsize=15)
plt.tight_layout(rect=[0, 0.03, 1, 0.95])
plt.show()
categorical_feature = [col for col in feature if col not in ['conflict_rate']]
print(categorical_feature)
plt.figure(figsize=(20,15))
plt.suptitle("Violin Plot", fontsize=15)
for i in range(len(categorical_feature)):
print(i+1, categorical_feature[i])
plt.subplot(3,2,i+1)
plt.xlabel(categorical_feature[i])
plt.ylabel(target)
sns.violinplot(x= df[categorical_feature[i]], y= df[target])
plt.tight_layout(rect=[0, 0.03, 1, 0.95])
plt.show()
'''
path = os.getcwd()+'./result'
validateDir(path)
df = pd.DataFrame(columns=['lock', 'vus', 'tickets', 'minBackoff', 'maxBackoff', 'retry', 'waitTime', 'leaseTime', 'duration', 'passes', 'fails'])
subdir_names=os.listdir(path)
for file_name in subdir_names:
df1 = pd.read_csv(path+"\\"+file_name, index_col = 0)
df= pd.concat([df, df1])
convert_to_numeric(df)
df['pass_rate'] = df['passes'] /np.minimum(df['vus'], df['tickets'])
df['conflict_rate'] = df['vus'] / df['tickets']
df = df.round(2)
수치형 컬럼 : ['duration', 'passes', 'fails'] 카테고리형 컬럼 : ['lock', 'vus', 'tickets', 'minBackoff', 'maxBackoff', 'retry', 'waitTime', 'leaseTime'] 숫자값을 수치형 컬럼으로 변환합니다. lock object vus int64 tickets int64 minBackoff int64 maxBackoff int64 retry int64 waitTime int64 leaseTime int64 duration float64 passes float64 fails float64 dtype: object
print(df.loc[df.pass_rate < 1, ['lock','vus','tickets','passes', 'pass_rate']])
df = df.drop(['passes', 'fails'], axis=1)
lock vus tickets passes pass_rate 27 d-lock 30 1000 29.67 0.99 64 d-lock 50 100 49.00 0.98 65 d-lock 50 100 48.33 0.97
pass_rate = 0.98
common_columns = ['vus', 'tickets', 'duration', 'conflict_rate']
olock_columns = ['minBackoff', 'maxBackoff','retry']
plock_columns = []
dlock_columns =['waitTime', 'leaseTime']
o_df = df.loc[(df.lock == 'o-lock') & (df['pass_rate'] >= pass_rate), common_columns + olock_columns]
p_df = df.loc[df.lock == 'p-lock', common_columns + plock_columns]
d_df = df.loc[df.lock == 'd-lock', common_columns + dlock_columns]
print(o_df)
vus tickets duration conflict_rate minBackoff maxBackoff retry 81 10 1000 145.33 0.01 200 300 30 82 10 1000 142.30 0.01 200 300 60 83 10 1000 143.21 0.01 200 300 90 84 10 1000 153.64 0.01 200 400 30 85 10 1000 145.37 0.01 200 400 60 .. ... ... ... ... ... ... ... 157 50 10 193.27 5.00 200 400 60 158 50 10 157.20 5.00 200 400 90 159 50 10 152.01 5.00 200 500 30 160 50 10 159.37 5.00 200 500 60 161 50 10 201.63 5.00 200 500 90 [81 rows x 7 columns]
# print(o_df.groupby(['maxBackoff', 'retry']).mean())
show_mean_duration_for(o_df,max_conflict_rate=1, groupby_columns=olock_columns)
# show_mean_duration_for(o_df, max_vus=50, groupby_columns=olock_columns)
feature = [col for col in o_df.columns if col not in ['duration']]
target = 'duration'
corr_between_feature_target(o_df, feature, target)
(min_vus : 0, max_vus : 10000000, min_conflict_rate : 0.00, max_conflict_rate : 1.00) 조건에서의 통계 분석 duration 순으로 상위 5개 vus tickets duration conflict_rate minBackoff maxBackoff retry 96 10 100 40.12 0.1 200 500 30 95 10 100 42.88 0.1 200 400 90 98 10 100 48.18 0.1 200 500 90 90 10 100 50.02 0.1 200 300 30 97 10 100 50.19 0.1 200 500 60 ['minBackoff', 'maxBackoff', 'retry'] 별 duration 평균값 duration minBackoff maxBackoff retry 200 300 30 102.675714 60 103.818571 90 103.302857 400 30 105.884286 60 109.222857 90 111.395714 500 30 104.068571 60 106.268571 90 112.061429 ['minBackoff', 'maxBackoff', 'retry'] 별 duration 최소값 duration minBackoff maxBackoff retry 200 300 30 50.02 60 68.41 90 55.85 400 30 59.81 60 57.01 90 42.88 500 30 40.12 60 50.19 90 48.18
print(p_df)
vus tickets duration conflict_rate 162 10 1000 234.65 0.01 163 10 100 90.02 0.10 164 10 10 72.14 1.00 165 30 1000 218.05 0.03 166 30 100 183.79 0.30 167 30 10 127.14 3.00 168 50 1000 313.14 0.05 169 50 100 289.23 0.50 170 50 10 156.00 5.00
# print(p_df.mean())
show_mean_duration_for(p_df,min_conflict_rate=1, groupby_columns=plock_columns)
# show_mean_duration_for(p_df, max_vus=50, groupby_columns=plock_columns)
feature = [col for col in p_df.columns if col not in ['duration']]
target = 'duration'
corr_between_feature_target(p_df, feature, target)
(min_vus : 0, max_vus : 10000000, min_conflict_rate : 1.00, max_conflict_rate : 10000000.00) 조건에서의 통계 분석 duration 순으로 상위 5개 vus tickets duration conflict_rate 164 10 10 72.14 1.0 167 30 10 127.14 3.0 170 50 10 156.00 5.0 평균 duration vus 30.000000 tickets 10.000000 duration 118.426667 conflict_rate 3.000000 dtype: float64 최소 duration vus 10.00 tickets 10.00 duration 72.14 conflict_rate 1.00 dtype: float64
print(d_df)
vus tickets duration conflict_rate waitTime leaseTime 0 10 1000 276.67 0.01 300 1000 1 10 1000 274.56 0.01 300 1500 2 10 1000 274.80 0.01 300 500 3 10 1000 271.58 0.01 500 1000 4 10 1000 274.55 0.01 500 1500 .. ... ... ... ... ... ... 76 50 10 398.90 5.00 500 1500 77 50 10 394.82 5.00 500 500 78 50 10 383.79 5.00 700 1000 79 50 10 390.45 5.00 700 1500 80 50 10 400.02 5.00 700 500 [81 rows x 6 columns]
show_mean_duration_for(d_df,min_conflict_rate=1, groupby_columns=dlock_columns)
# show_mean_duration_for(d_df, max_vus=50, groupby_columns=dlock_columns)
feature = [col for col in d_df.columns if col not in ['duration']]
target = 'duration'
corr_between_feature_target(d_df, feature, target)
(min_vus : 0, max_vus : 10000000, min_conflict_rate : 1.00, max_conflict_rate : 10000000.00) 조건에서의 통계 분석 duration 순으로 상위 5개 vus tickets duration conflict_rate waitTime leaseTime 23 10 10 117.91 1.0 500 500 26 10 10 118.56 1.0 700 500 24 10 10 119.14 1.0 700 1000 22 10 10 120.57 1.0 500 1500 25 10 10 121.89 1.0 700 1500 ['waitTime', 'leaseTime'] 별 duration 평균값 duration waitTime leaseTime 300 500 281.843333 1000 272.203333 1500 271.050000 500 500 263.770000 1000 269.293333 1500 267.786667 700 500 271.746667 1000 268.016667 1500 265.320000 ['waitTime', 'leaseTime'] 별 duration 최소값 duration waitTime leaseTime 300 500 159.45 1000 122.19 1500 122.71 500 500 117.91 1000 130.02 1500 120.57 700 500 118.56 1000 119.14 1500 121.89
show_mean_duration_for(df.loc[df.pass_rate >= pass_rate, :], groupby_columns=['lock', 'vus', 'tickets'])
(min_vus : 0, max_vus : 10000000, min_conflict_rate : 0.00, max_conflict_rate : 10000000.00) 조건에서의 통계 분석 duration 순으로 상위 5개 lock vus tickets minBackoff maxBackoff retry waitTime leaseTime \ 96 o-lock 10 100 200 500 30 0 0 95 o-lock 10 100 200 400 90 0 0 98 o-lock 10 100 200 500 90 0 0 90 o-lock 10 100 200 300 30 0 0 97 o-lock 10 100 200 500 60 0 0 duration pass_rate conflict_rate 96 40.12 1.0 0.1 95 42.88 1.0 0.1 98 48.18 1.0 0.1 90 50.02 1.0 0.1 97 50.19 1.0 0.1 ['lock', 'vus', 'tickets'] 별 duration 평균값 duration lock vus tickets d-lock 10 10 125.826667 100 134.270000 1000 271.670000 30 10 292.123333 100 338.795556 1000 358.750000 50 10 392.393333 100 511.880000 1000 540.513333 o-lock 10 10 135.137778 100 53.398889 1000 147.421111 30 10 143.456667 100 83.848889 1000 102.275556 50 10 170.506667 100 104.290000 1000 119.282222 p-lock 10 10 72.140000 100 90.020000 1000 234.650000 30 10 127.140000 100 183.790000 1000 218.050000 50 10 156.000000 100 289.230000 1000 313.140000 ['lock', 'vus', 'tickets'] 별 duration 최소값 duration lock vus tickets d-lock 10 10 117.91 100 130.12 1000 262.67 30 10 278.58 100 313.83 1000 351.98 50 10 382.09 100 487.06 1000 512.25 o-lock 10 10 107.40 100 40.12 1000 142.30 30 10 127.36 100 68.41 1000 98.59 50 10 152.01 100 93.84 1000 107.87 p-lock 10 10 72.14 100 90.02 1000 234.65 30 10 127.14 100 183.79 1000 218.05 50 10 156.00 100 289.23 1000 313.14
show_mean_duration_for(df.loc[df.pass_rate >= pass_rate, :], max_conflict_rate=0.5, groupby_columns=['lock', 'vus', 'tickets'])
(min_vus : 0, max_vus : 10000000, min_conflict_rate : 0.00, max_conflict_rate : 0.50) 조건에서의 통계 분석 duration 순으로 상위 5개 lock vus tickets minBackoff maxBackoff retry waitTime leaseTime \ 96 o-lock 10 100 200 500 30 0 0 95 o-lock 10 100 200 400 90 0 0 98 o-lock 10 100 200 500 90 0 0 90 o-lock 10 100 200 300 30 0 0 97 o-lock 10 100 200 500 60 0 0 duration pass_rate conflict_rate 96 40.12 1.0 0.1 95 42.88 1.0 0.1 98 48.18 1.0 0.1 90 50.02 1.0 0.1 97 50.19 1.0 0.1 ['lock', 'vus', 'tickets'] 별 duration 평균값 duration lock vus tickets d-lock 10 100 134.270000 1000 271.670000 30 100 338.795556 1000 358.750000 50 100 511.880000 1000 540.513333 o-lock 10 100 53.398889 1000 147.421111 30 100 83.848889 1000 102.275556 50 100 104.290000 1000 119.282222 p-lock 10 100 90.020000 1000 234.650000 30 100 183.790000 1000 218.050000 50 100 289.230000 1000 313.140000 ['lock', 'vus', 'tickets'] 별 duration 최소값 duration lock vus tickets d-lock 10 100 130.12 1000 262.67 30 100 313.83 1000 351.98 50 100 487.06 1000 512.25 o-lock 10 100 40.12 1000 142.30 30 100 68.41 1000 98.59 50 100 93.84 1000 107.87 p-lock 10 100 90.02 1000 234.65 30 100 183.79 1000 218.05 50 100 289.23 1000 313.14
show_mean_duration_for(df.loc[df.pass_rate >= pass_rate, :], min_conflict_rate=1, groupby_columns=['lock', 'vus', 'tickets'])
(min_vus : 0, max_vus : 10000000, min_conflict_rate : 1.00, max_conflict_rate : 10000000.00) 조건에서의 통계 분석 duration 순으로 상위 5개 lock vus tickets minBackoff maxBackoff retry waitTime leaseTime \ 164 p-lock 10 10 0 0 0 0 0 99 o-lock 10 10 200 300 30 0 0 23 d-lock 10 10 0 0 0 500 500 101 o-lock 10 10 200 300 90 0 0 26 d-lock 10 10 0 0 0 700 500 duration pass_rate conflict_rate 164 72.14 1.0 1.0 99 107.40 1.0 1.0 23 117.91 1.0 1.0 101 118.50 1.0 1.0 26 118.56 1.0 1.0 ['lock', 'vus', 'tickets'] 별 duration 평균값 duration lock vus tickets d-lock 10 10 125.826667 30 10 292.123333 50 10 392.393333 o-lock 10 10 135.137778 30 10 143.456667 50 10 170.506667 p-lock 10 10 72.140000 30 10 127.140000 50 10 156.000000 ['lock', 'vus', 'tickets'] 별 duration 최소값 duration lock vus tickets d-lock 10 10 117.91 30 10 278.58 50 10 382.09 o-lock 10 10 107.40 30 10 127.36 50 10 152.01 p-lock 10 10 72.14 30 10 127.14 50 10 156.00