import pandas as pd
import os
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
def validateDir(path: str):
assert os.path.exists(path) and os.path.isdir(path), '경로에 디렉토리가 존재하지 않습니다.'
def convert_to_numeric(df):
numeric_feature = df.columns[(df.dtypes==int) | (df.dtypes== float)]
categorical_feature = df.columns[df.dtypes=='O']
print("수치형 컬럼 : ", list(numeric_feature))
print("카테고리형 컬럼 : ", list(categorical_feature))
for col in categorical_feature:
if isinstance(df[col][0], int) or (isinstance(df[col][0],str) and df[col][0].isnumeric()):
df[col] = pd.to_numeric(df[col])
print("숫자값을 수치형 컬럼으로 변환합니다.")
print(df.dtypes)
def check_missing_col(dataframe):
missing_col = []
counted_missing_col = 0
for i, col in enumerate(dataframe.columns):
missing_values = sum(dataframe[col].isna())
is_missing = True if missing_values >= 1 else False
if is_missing:
counted_missing_col += 1
print(f'결측치가 있는 컬럼: {col}')
print(f'해당 컬럼의 결측치 수: {missing_values}')
missing_col.append([col, dataframe[col].dtype])
if counted_missing_col == 0:
print('결측치가 존재하지 않습니다')
return missing_col
def show_mean_duration_for(df, min_vus = 0, max_vus = 10000000, min_conflict_rate=0, max_conflict_rate=10000000, groupby_columns = []):
print(f'\n(min_vus : {min_vus}, max_vus : {max_vus}, min_conflict_rate : {min_conflict_rate:.2f}, max_conflict_rate : {max_conflict_rate:.2f}) 조건에서의 통계 분석')
df = df.loc[(df.vus >= min_vus)&(df.vus <= max_vus)&(df.conflict_rate >= min_conflict_rate)&(df.conflict_rate <= max_conflict_rate),:]
print('\nduration 순으로 상위 5개')
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
print(df.sort_values(['duration']).head(5))
if groupby_columns:
print('\n',groupby_columns, '별 duration 평균값')
print(df.loc[:, ['duration']+groupby_columns].groupby(groupby_columns).mean())
print('\n',groupby_columns, '별 duration 최소값')
print(df.loc[:, ['duration']+groupby_columns].groupby(groupby_columns).min())
else:
print('\n평균 duration')
print(df.mean())
print('\n최소 duration')
print(df.min())
def corr_between_feature_target(df, feature, target):
plt.figure(figsize=(15,10))
heat_table = df.corr(method="kendall")
mask = np.zeros_like(heat_table)
mask[np.triu_indices_from(mask)] = True
heatmap_ax = sns.heatmap(heat_table, annot=True, mask=mask, cmap='coolwarm')
heatmap_ax.set_xticklabels(heatmap_ax.get_xticklabels(), fontsize=15, rotation=45)
heatmap_ax.set_yticklabels(heatmap_ax.get_yticklabels(), fontsize=15)
plt.title('correlation between features', fontsize=20)
plt.show()
'''
numeric_feature = ['conflict_rate']
plt.figure(figsize=(20,15))
plt.suptitle("Pearson Scatter Plot", fontsize=15)
for i in range(len(numeric_feature)):
plt.subplot(2,3,i+1)
plt.xlabel(numeric_feature[i])
plt.ylabel(target)
corr_score = df[[numeric_feature[i], target]].corr().iloc[0,1].round(2)
c = 'red' if corr_score > 0 else 'blue'
plt.scatter(df[numeric_feature[i]], df[target], color=c, label=f"corr = {corr_score}")
plt.legend(fontsize=15)
plt.tight_layout(rect=[0, 0.03, 1, 0.95])
plt.show()
categorical_feature = [col for col in feature if col not in ['conflict_rate']]
print(categorical_feature)
plt.figure(figsize=(20,15))
plt.suptitle("Violin Plot", fontsize=15)
for i in range(len(categorical_feature)):
print(i+1, categorical_feature[i])
plt.subplot(3,2,i+1)
plt.xlabel(categorical_feature[i])
plt.ylabel(target)
sns.violinplot(x= df[categorical_feature[i]], y= df[target])
plt.tight_layout(rect=[0, 0.03, 1, 0.95])
plt.show()
'''
path = os.getcwd()+'./result'
validateDir(path)
df = pd.DataFrame(columns=['lock', 'vus', 'tickets', 'minBackoff', 'maxBackoff', 'retry', 'waitTime', 'leaseTime', 'duration', 'passes', 'fails'])
subdir_names=os.listdir(path)
for file_name in subdir_names:
df1 = pd.read_csv(path+"\\"+file_name, index_col = 0)
df= pd.concat([df, df1])
convert_to_numeric(df)
df['pass_rate'] = df['passes'] /np.minimum(df['vus'], df['tickets'])
df['conflict_rate'] = df['vus'] / df['tickets']
df = df.round(2)
수치형 컬럼 : ['duration', 'passes', 'fails'] 카테고리형 컬럼 : ['lock', 'vus', 'tickets', 'minBackoff', 'maxBackoff', 'retry', 'waitTime', 'leaseTime'] 숫자값을 수치형 컬럼으로 변환합니다. lock object vus int64 tickets int64 minBackoff int64 maxBackoff int64 retry int64 waitTime int64 leaseTime int64 duration float64 passes float64 fails float64 dtype: object
print(df.loc[df.pass_rate < 1, ['lock','vus','tickets','passes', 'pass_rate']])
df = df.drop(['passes', 'fails'], axis=1)
lock vus tickets passes pass_rate 27 d-lock 30 1000 29.67 0.99 64 d-lock 50 100 49.00 0.98 65 d-lock 50 100 48.33 0.97
pass_rate = 0.98
common_columns = ['vus', 'tickets', 'duration', 'conflict_rate']
olock_columns = ['minBackoff', 'maxBackoff','retry']
plock_columns = []
dlock_columns =['waitTime', 'leaseTime']
o_df = df.loc[(df.lock == 'o-lock') & (df['pass_rate'] >= pass_rate), common_columns + olock_columns]
p_df = df.loc[df.lock == 'p-lock', common_columns + plock_columns]
d_df = df.loc[df.lock == 'd-lock', common_columns + dlock_columns]
print(o_df)
vus tickets duration conflict_rate minBackoff maxBackoff retry 81 10 1000 145.33 0.01 200 300 30 82 10 1000 142.30 0.01 200 300 60 83 10 1000 143.21 0.01 200 300 90 84 10 1000 153.64 0.01 200 400 30 85 10 1000 145.37 0.01 200 400 60 .. ... ... ... ... ... ... ... 157 50 10 193.27 5.00 200 400 60 158 50 10 157.20 5.00 200 400 90 159 50 10 152.01 5.00 200 500 30 160 50 10 159.37 5.00 200 500 60 161 50 10 201.63 5.00 200 500 90 [81 rows x 7 columns]
# print(o_df.groupby(['maxBackoff', 'retry']).mean())
show_mean_duration_for(o_df,max_conflict_rate=1, groupby_columns=olock_columns)
# show_mean_duration_for(o_df, max_vus=50, groupby_columns=olock_columns)
feature = [col for col in o_df.columns if col not in ['duration']]
target = 'duration'
corr_between_feature_target(o_df, feature, target)
(min_vus : 0, max_vus : 10000000, min_conflict_rate : 0.00, max_conflict_rate : 1.00) 조건에서의 통계 분석
duration 순으로 상위 5개
vus tickets duration conflict_rate minBackoff maxBackoff retry
96 10 100 40.12 0.1 200 500 30
95 10 100 42.88 0.1 200 400 90
98 10 100 48.18 0.1 200 500 90
90 10 100 50.02 0.1 200 300 30
97 10 100 50.19 0.1 200 500 60
['minBackoff', 'maxBackoff', 'retry'] 별 duration 평균값
duration
minBackoff maxBackoff retry
200 300 30 102.675714
60 103.818571
90 103.302857
400 30 105.884286
60 109.222857
90 111.395714
500 30 104.068571
60 106.268571
90 112.061429
['minBackoff', 'maxBackoff', 'retry'] 별 duration 최소값
duration
minBackoff maxBackoff retry
200 300 30 50.02
60 68.41
90 55.85
400 30 59.81
60 57.01
90 42.88
500 30 40.12
60 50.19
90 48.18
print(p_df)
vus tickets duration conflict_rate 162 10 1000 234.65 0.01 163 10 100 90.02 0.10 164 10 10 72.14 1.00 165 30 1000 218.05 0.03 166 30 100 183.79 0.30 167 30 10 127.14 3.00 168 50 1000 313.14 0.05 169 50 100 289.23 0.50 170 50 10 156.00 5.00
# print(p_df.mean())
show_mean_duration_for(p_df,min_conflict_rate=1, groupby_columns=plock_columns)
# show_mean_duration_for(p_df, max_vus=50, groupby_columns=plock_columns)
feature = [col for col in p_df.columns if col not in ['duration']]
target = 'duration'
corr_between_feature_target(p_df, feature, target)
(min_vus : 0, max_vus : 10000000, min_conflict_rate : 1.00, max_conflict_rate : 10000000.00) 조건에서의 통계 분석
duration 순으로 상위 5개
vus tickets duration conflict_rate
164 10 10 72.14 1.0
167 30 10 127.14 3.0
170 50 10 156.00 5.0
평균 duration
vus 30.000000
tickets 10.000000
duration 118.426667
conflict_rate 3.000000
dtype: float64
최소 duration
vus 10.00
tickets 10.00
duration 72.14
conflict_rate 1.00
dtype: float64
print(d_df)
vus tickets duration conflict_rate waitTime leaseTime 0 10 1000 276.67 0.01 300 1000 1 10 1000 274.56 0.01 300 1500 2 10 1000 274.80 0.01 300 500 3 10 1000 271.58 0.01 500 1000 4 10 1000 274.55 0.01 500 1500 .. ... ... ... ... ... ... 76 50 10 398.90 5.00 500 1500 77 50 10 394.82 5.00 500 500 78 50 10 383.79 5.00 700 1000 79 50 10 390.45 5.00 700 1500 80 50 10 400.02 5.00 700 500 [81 rows x 6 columns]
show_mean_duration_for(d_df,min_conflict_rate=1, groupby_columns=dlock_columns)
# show_mean_duration_for(d_df, max_vus=50, groupby_columns=dlock_columns)
feature = [col for col in d_df.columns if col not in ['duration']]
target = 'duration'
corr_between_feature_target(d_df, feature, target)
(min_vus : 0, max_vus : 10000000, min_conflict_rate : 1.00, max_conflict_rate : 10000000.00) 조건에서의 통계 분석
duration 순으로 상위 5개
vus tickets duration conflict_rate waitTime leaseTime
23 10 10 117.91 1.0 500 500
26 10 10 118.56 1.0 700 500
24 10 10 119.14 1.0 700 1000
22 10 10 120.57 1.0 500 1500
25 10 10 121.89 1.0 700 1500
['waitTime', 'leaseTime'] 별 duration 평균값
duration
waitTime leaseTime
300 500 281.843333
1000 272.203333
1500 271.050000
500 500 263.770000
1000 269.293333
1500 267.786667
700 500 271.746667
1000 268.016667
1500 265.320000
['waitTime', 'leaseTime'] 별 duration 최소값
duration
waitTime leaseTime
300 500 159.45
1000 122.19
1500 122.71
500 500 117.91
1000 130.02
1500 120.57
700 500 118.56
1000 119.14
1500 121.89
show_mean_duration_for(df.loc[df.pass_rate >= pass_rate, :], groupby_columns=['lock', 'vus', 'tickets'])
(min_vus : 0, max_vus : 10000000, min_conflict_rate : 0.00, max_conflict_rate : 10000000.00) 조건에서의 통계 분석
duration 순으로 상위 5개
lock vus tickets minBackoff maxBackoff retry waitTime leaseTime \
96 o-lock 10 100 200 500 30 0 0
95 o-lock 10 100 200 400 90 0 0
98 o-lock 10 100 200 500 90 0 0
90 o-lock 10 100 200 300 30 0 0
97 o-lock 10 100 200 500 60 0 0
duration pass_rate conflict_rate
96 40.12 1.0 0.1
95 42.88 1.0 0.1
98 48.18 1.0 0.1
90 50.02 1.0 0.1
97 50.19 1.0 0.1
['lock', 'vus', 'tickets'] 별 duration 평균값
duration
lock vus tickets
d-lock 10 10 125.826667
100 134.270000
1000 271.670000
30 10 292.123333
100 338.795556
1000 358.750000
50 10 392.393333
100 511.880000
1000 540.513333
o-lock 10 10 135.137778
100 53.398889
1000 147.421111
30 10 143.456667
100 83.848889
1000 102.275556
50 10 170.506667
100 104.290000
1000 119.282222
p-lock 10 10 72.140000
100 90.020000
1000 234.650000
30 10 127.140000
100 183.790000
1000 218.050000
50 10 156.000000
100 289.230000
1000 313.140000
['lock', 'vus', 'tickets'] 별 duration 최소값
duration
lock vus tickets
d-lock 10 10 117.91
100 130.12
1000 262.67
30 10 278.58
100 313.83
1000 351.98
50 10 382.09
100 487.06
1000 512.25
o-lock 10 10 107.40
100 40.12
1000 142.30
30 10 127.36
100 68.41
1000 98.59
50 10 152.01
100 93.84
1000 107.87
p-lock 10 10 72.14
100 90.02
1000 234.65
30 10 127.14
100 183.79
1000 218.05
50 10 156.00
100 289.23
1000 313.14
show_mean_duration_for(df.loc[df.pass_rate >= pass_rate, :], max_conflict_rate=0.5, groupby_columns=['lock', 'vus', 'tickets'])
(min_vus : 0, max_vus : 10000000, min_conflict_rate : 0.00, max_conflict_rate : 0.50) 조건에서의 통계 분석
duration 순으로 상위 5개
lock vus tickets minBackoff maxBackoff retry waitTime leaseTime \
96 o-lock 10 100 200 500 30 0 0
95 o-lock 10 100 200 400 90 0 0
98 o-lock 10 100 200 500 90 0 0
90 o-lock 10 100 200 300 30 0 0
97 o-lock 10 100 200 500 60 0 0
duration pass_rate conflict_rate
96 40.12 1.0 0.1
95 42.88 1.0 0.1
98 48.18 1.0 0.1
90 50.02 1.0 0.1
97 50.19 1.0 0.1
['lock', 'vus', 'tickets'] 별 duration 평균값
duration
lock vus tickets
d-lock 10 100 134.270000
1000 271.670000
30 100 338.795556
1000 358.750000
50 100 511.880000
1000 540.513333
o-lock 10 100 53.398889
1000 147.421111
30 100 83.848889
1000 102.275556
50 100 104.290000
1000 119.282222
p-lock 10 100 90.020000
1000 234.650000
30 100 183.790000
1000 218.050000
50 100 289.230000
1000 313.140000
['lock', 'vus', 'tickets'] 별 duration 최소값
duration
lock vus tickets
d-lock 10 100 130.12
1000 262.67
30 100 313.83
1000 351.98
50 100 487.06
1000 512.25
o-lock 10 100 40.12
1000 142.30
30 100 68.41
1000 98.59
50 100 93.84
1000 107.87
p-lock 10 100 90.02
1000 234.65
30 100 183.79
1000 218.05
50 100 289.23
1000 313.14
show_mean_duration_for(df.loc[df.pass_rate >= pass_rate, :], min_conflict_rate=1, groupby_columns=['lock', 'vus', 'tickets'])
(min_vus : 0, max_vus : 10000000, min_conflict_rate : 1.00, max_conflict_rate : 10000000.00) 조건에서의 통계 분석
duration 순으로 상위 5개
lock vus tickets minBackoff maxBackoff retry waitTime leaseTime \
164 p-lock 10 10 0 0 0 0 0
99 o-lock 10 10 200 300 30 0 0
23 d-lock 10 10 0 0 0 500 500
101 o-lock 10 10 200 300 90 0 0
26 d-lock 10 10 0 0 0 700 500
duration pass_rate conflict_rate
164 72.14 1.0 1.0
99 107.40 1.0 1.0
23 117.91 1.0 1.0
101 118.50 1.0 1.0
26 118.56 1.0 1.0
['lock', 'vus', 'tickets'] 별 duration 평균값
duration
lock vus tickets
d-lock 10 10 125.826667
30 10 292.123333
50 10 392.393333
o-lock 10 10 135.137778
30 10 143.456667
50 10 170.506667
p-lock 10 10 72.140000
30 10 127.140000
50 10 156.000000
['lock', 'vus', 'tickets'] 별 duration 최소값
duration
lock vus tickets
d-lock 10 10 117.91
30 10 278.58
50 10 382.09
o-lock 10 10 107.40
30 10 127.36
50 10 152.01
p-lock 10 10 72.14
30 10 127.14
50 10 156.00