import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from numba import jit
%matplotlib inline
import seaborn as sns
# Load the three CSV inputs: the training readings, the test readings and the
# submission template that is filled in and written out at the end of the script.
raw_train = pd.read_csv("data/train.csv")
raw_test = pd.read_csv("data/test.csv")
raw_submission = pd.read_csv("data/submission_1002.csv")

# Discard every training column that has 16909 or more missing entries.
raw_train = raw_train.drop(
    columns=raw_train.columns[raw_train.isna().sum() >= 16909]
)
# NOTE: per the original author's remark, row index 8760 is where the
# prediction dates start (i.e. the test CSV is expected to contribute 8760
# rows before the rows appended below -- not verified in this function).
def set_raw_test_data(raw_test, start='7/1/2018', end='11/30/2018'):
    """Return ``raw_test`` extended with one empty row per hour to predict.

    For every hourly timestamp between ``start`` and ``end`` (both inclusive)
    a row is appended whose ``"Time"`` column holds the timestamp and whose
    other columns are NaN.  The result has a fresh 0..n-1 index.

    The rows are built in one frame and concatenated once; the previous
    implementation called ``DataFrame.append`` once per hour, which copies
    the whole frame each time and no longer exists in pandas >= 2.0.

    Args:
        raw_test: DataFrame of test readings; not modified.
        start: First timestamp to append (anything ``pd.date_range`` accepts).
        end: Last timestamp to append.

    Returns:
        A new DataFrame with the original rows followed by the appended rows.
    """
    forecast_hours = pd.date_range(start=start, end=end, freq=pd.Timedelta(hours=1))
    future_rows = pd.DataFrame(
        np.nan,
        index=pd.RangeIndex(len(forecast_hours)),
        columns=raw_test.columns,
    )
    # Only the "Time" column carries data; a frame without it gets all-NaN rows,
    # exactly as the per-row loop produced.
    if "Time" in future_rows.columns:
        future_rows["Time"] = forecast_hours.values
    return pd.concat([raw_test, future_rows], ignore_index=True)
# Append the empty rows for the prediction period to the test frame.
raw_test = set_raw_test_data(raw_test)

# Make basic dictionary: the training column names plus two empty containers
# that set_meter_dicts fills in.
meter_train = raw_train.columns
meter_dict_train, meter_remove_index_dict = {}, {}
def set_meter_dicts(origin_df, meter_train, meter_dict_train, meter_remove_index_dict):
    """Split a wide frame into one long-format frame per column.

    For each name in ``meter_train`` a DataFrame with the columns
    ``value`` (that column of ``origin_df``), ``time`` (``origin_df["Time"]``)
    and ``id`` (the column name) is stored in ``meter_dict_train``, and
    ``meter_remove_index_dict[name]`` is initialised to 0.  For every column
    other than ``"Time"`` missing values are replaced by the column mean.

    The numba ``@jit(forceobj=True)`` decorator was removed: the body is pure
    pandas, which object mode cannot accelerate, so it only added compile
    overhead.

    Args:
        origin_df: Wide DataFrame containing a ``"Time"`` column.
        meter_train: Iterable of column names of ``origin_df`` to process.
        meter_dict_train: Dict that receives the per-column frames (mutated).
        meter_remove_index_dict: Dict that receives a 0 per column (mutated).

    Returns:
        The two dicts that were passed in, ``(meter_dict_train,
        meter_remove_index_dict)``.
    """
    for meter in meter_train:
        frame = pd.DataFrame(origin_df[meter])
        frame["time"] = origin_df["Time"]
        frame = frame.rename(columns={meter: "value"})
        frame["id"] = meter
        meter_dict_train[meter] = frame
        meter_remove_index_dict[meter] = 0
        if meter != "Time":
            mean_value = frame["value"].mean()
            frame["value"] = frame["value"].fillna(mean_value)
            # NOTE(review): if NaNs survive the mean fill (e.g. the column is
            # entirely NaN) the loop stops and all remaining columns are left
            # out of both dicts.  Kept as in the original; the original
            # indentation was lost, so the nesting of this check under the
            # "Time" guard is an assumption -- confirm.
            if frame["value"].isna().sum() > 0:
                break
    return meter_dict_train, meter_remove_index_dict
# Build the per-column training frames (missing values mean-filled for every
# column except "Time") and initialise the per-column index dictionary to 0.
meter_dict_train, meter_remove_index_dict = set_meter_dicts(raw_train, meter_train, meter_dict_train, meter_remove_index_dict)
# Get front nan indices
def get_front_nan_indices(meter_train, meter_dict_train, meter_remove_index_dict):
    """Record, per meter, the index label of its first non-null value.

    For each name in ``meter_train`` the ``"value"`` column of
    ``meter_dict_train[name]`` is inspected and the index label of the first
    non-null entry is written to ``meter_remove_index_dict[name]``.  When a
    column has no non-null entry (or is empty) the existing dictionary entry
    is left untouched.

    Uses ``Series.first_valid_index`` instead of a Python-level scan over
    every element, and drops the numba object-mode decorator, which gave no
    speedup for pandas objects.

    Args:
        meter_train: Iterable of meter names (keys of ``meter_dict_train``).
        meter_dict_train: Dict of per-meter frames with a ``"value"`` column.
        meter_remove_index_dict: Dict updated in place with the found labels.

    Returns:
        The three arguments, unchanged in identity:
        ``(meter_train, meter_dict_train, meter_remove_index_dict)``.
    """
    for meter in meter_train:
        first_valid = meter_dict_train[meter]["value"].first_valid_index()
        if first_valid is not None:
            meter_remove_index_dict[meter] = first_valid
    return meter_train, meter_dict_train, meter_remove_index_dict
# Record, for every training column, the index of its first non-null "value".
meter_train, meter_dict_train, meter_remove_index_dict = get_front_nan_indices(meter_train, meter_dict_train, meter_remove_index_dict)
from sklearn.cluster import DBSCAN
def get_cluster_labels(meter_dict_train):
    """Cluster the meters by their value series with DBSCAN.

    Every entry of ``meter_dict_train`` except ``"Time"`` contributes its
    ``"value"`` column as one sample, in dictionary order.  The returned
    labels are in that same order, so label ``i`` belongs to the ``i``-th
    non-"Time" key.

    The numba ``@jit(forceobj=True)`` decorator was removed: the function only
    gathers arrays and calls scikit-learn, which object mode cannot speed up.

    Args:
        meter_dict_train: Dict of per-meter frames with a ``"value"`` column.

    Returns:
        The ``labels_`` array of the fitted ``DBSCAN`` estimator.
    """
    cluster_train = [
        dataframe["value"].values
        for meter, dataframe in meter_dict_train.items()
        if meter != "Time"
    ]
    clustering = DBSCAN(eps=3, min_samples=2, n_jobs=-1).fit(cluster_train)
    return clustering.labels_
# One DBSCAN cluster label per non-"Time" training column.
labels = get_cluster_labels(meter_dict_train)
# Notebook-style inspection of the distinct labels; the value is discarded
# when this file runs as a plain script.
set(labels)
def get_trainable_dataframe(dataframe, meter, meter_keys, labels=False):
    """Add calendar, FFT and cluster-label features to a per-meter frame.

    ``dataframe`` is modified in place and also returned.  Added columns:
    ``year``, ``month``, ``day``, ``hour`` (from ``time``, which is converted
    to datetime), ``fft`` (real part of the FFT of ``value`` divided by the
    number of rows) and ``label``.

    Changes from the previous version:
      * the real part of the FFT is taken explicitly instead of relying on a
        complex -> float64 cast that silently dropped the imaginary part;
      * the label is looked up by the meter's position among the non-"Time"
        keys, which is how the labels are produced, instead of assuming that
        "Time" is the first key (``index - 1``);
      * an empty frame no longer fails inside ``np.fft.fft``.

    Args:
        dataframe: Frame with at least ``time`` and ``value`` columns.
        meter: Name of the meter this frame belongs to.
        meter_keys: List of all keys, in the order used to build ``labels``.
        labels: Sequence of cluster labels, one per non-"Time" key, or a
            bool / ``None`` to mark the frame as unlabeled (label ``-1``).

    Returns:
        The same ``dataframe`` object with the extra columns.

    Raises:
        ValueError: If ``labels`` is given and ``meter`` is not a non-"Time"
            entry of ``meter_keys``.
    """
    dataframe["time"] = pd.to_datetime(dataframe["time"])
    timestamps = dataframe["time"].dt
    dataframe["year"] = timestamps.year
    dataframe["month"] = timestamps.month
    dataframe["day"] = timestamps.day
    dataframe["hour"] = timestamps.hour

    values = dataframe["value"].values
    if len(values):
        spectrum = np.fft.fft(values) / len(values)
        dataframe["fft"] = np.real(spectrum).astype(np.float64)
    else:
        dataframe["fft"] = np.empty(0, dtype=np.float64)

    if labels is None or isinstance(labels, bool):
        dataframe["label"] = -1
    else:
        labeled_keys = [key for key in meter_keys if key != "Time"]
        dataframe["label"] = labels[labeled_keys.index(meter)]
    return dataframe
# Label classifier train
def get_label_train_data(meter_train, meter_dict_train, labels):
    """Collect per-meter feature frames and label series for the classifier.

    Every name in ``meter_train`` except ``"Time"`` is passed through
    ``get_trainable_dataframe`` (which adds the feature columns to the frame
    stored in ``meter_dict_train`` in place).  The feature columns and the
    ``label`` column of each frame are collected in two parallel lists.

    The key list handed to ``get_trainable_dataframe`` is built once instead
    of once per meter, and the numba object-mode decorator, which cannot
    accelerate pandas code, was removed.

    Args:
        meter_train: Iterable of meter names to process.
        meter_dict_train: Dict of per-meter frames.
        labels: Cluster labels, one per non-"Time" key of ``meter_dict_train``.

    Returns:
        ``(x_concat, y_concat)``: a list of feature DataFrames and a list of
        label Series, one pair per processed meter.
    """
    meter_keys = list(meter_dict_train.keys())
    x_concat = []
    y_concat = []
    for meter in meter_train:
        if meter == "Time":
            continue
        df = get_trainable_dataframe(meter_dict_train[meter], meter, meter_keys, labels)
        x_concat.append(df[["value", "year", "month", "day", "hour", "fft", "id"]])
        y_concat.append(df["label"])
    return x_concat, y_concat
# Stack the per-meter pieces into a single feature frame and a single label series.
X_label_parts, Y_label_parts = get_label_train_data(meter_train, meter_dict_train, labels)
X_label_train = pd.concat(X_label_parts)
Y_label_train = pd.concat(Y_label_parts)

# Replace the meter names in "id" by their integer category codes.
X_label_train["id"] = X_label_train["id"].astype('category').cat.codes

# Report the sizes of both pieces.
for label_piece in (X_label_train, Y_label_train):
    print(label_piece.shape)
from sklearn.ensemble import RandomForestClassifier
# The classifier is loaded from disk below; the commented lines show how the
# stored model was trained and saved.
# labelClassifier = RandomForestClassifier(
# n_estimators = 100,
# n_jobs = -1).fit(X_label_train, Y_label_train)
# `sklearn.externals.joblib` was removed from scikit-learn; use the standalone
# joblib package and fall back to the bundled copy only on old installations.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib
# NOTE: joblib.load unpickles arbitrary objects -- only load files you created
# yourself or otherwise trust.
labelClassifier = joblib.load("data/labelclf.pickle")
# joblib.dump(labelClassifier, "data/labelclf.pickle")
# Split the extended test frame into per-column frames (NaNs mean-filled).
meter_test = raw_test.columns
meter_dict_test = {}
meter_remove_index_test_dict = {}
meter_dict_test, meter_remove_index_test_dict = set_meter_dicts(raw_test, meter_test, meter_dict_test, meter_remove_index_test_dict)

# Add the calendar/FFT features to every meter frame (unlabeled: label = -1)
# and keep only the columns the label classifier is fed with.
test = []
for meter, dataframe in meter_dict_test.items():
    if meter != "Time":
        df = get_trainable_dataframe(dataframe, meter, list(meter_dict_test.keys()), labels=False)
        df = df[["value", "year", "month", "day", "hour", "fft", "id"]]
        test.append(df)
validation_set = pd.concat(test)

# Encode the meter names as integer category codes and remember the
# code -> name mapping so the names can be restored later.
# NOTE(review): these codes come from the test columns only; confirm they
# match the codes of the training frame the classifier was fitted on.
id_category = validation_set.id.astype('category')
dict_category = dict(enumerate(id_category.cat.categories))
validation_set["id"] = id_category.cat.codes

# Assign the predictions positionally.  validation_set keeps each meter's own
# 0..n-1 index (concat without ignore_index), so wrapping the predictions in a
# pd.Series aligned them on those repeated labels and gave every meter the
# predictions of the first meter.
validation_set["label"] = labelClassifier.predict(validation_set)
# Re-purpose the classifier data for value regression: the cluster label
# becomes a feature and "value" becomes the target.
# The labels are assigned positionally; X_label_train carries a repeated
# 0..n-1 index per meter, so assigning a freshly indexed pd.Series aligned on
# those labels and copied the first meter's labels to every meter.
X_label_train["label"] = Y_label_train.values
Y_label_train = X_label_train["value"]
# Notebook-style peek at the target; discarded when run as a script.
Y_label_train.head(5)
del X_label_train["value"]

# Same split for the validation frame: every column except "value" is a feature.
X_label_val = validation_set.loc[:, validation_set.columns != 'value']
Y_label_val = validation_set["value"]
# Sample Training
from xgboost import XGBRegressor
from sklearn.metrics import r2_score
# `n_estimators` was previously misspelled as `n_estimator`, so the requested
# 300 boosting rounds were never applied and the library default was used.
xgb = XGBRegressor(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=5,
    colsample_bytree=0.6,
    subsample=0.6,
    n_jobs=-1).fit(X_label_train, Y_label_train)
# Notebook-style display of the R^2 on the validation frame; the result is
# discarded when run as a script.
r2_score(Y_label_val, xgb.predict(X_label_val))
# Persist the fitted model and read it back from disk.
joblib.dump(xgb, "data/xgb.pickle")
xgb = joblib.load("data/xgb.pickle")
# Restore the meter names in "id" from their category codes.
validation_set["id"] = validation_set["id"].map(dict_category)
# Overwrite "value" with the model predictions, assigned positionally.
# validation_set has a repeated 0..n-1 index per meter, so wrapping the
# predictions in a pd.Series aligned them on those labels and gave every
# meter the predictions of the first meter.
validation_set["value"] = xgb.predict(X_label_val)
# Notebook-style peek; discarded when run as a script.
validation_set.tail(2)
# Index the submission template by meter so rows can be addressed by name.
raw_submission = raw_submission.set_index("meter_id")
raw_submission.head(2)
# for hourly prediction
# The first 24 submission columns receive the first 24 predicted values of
# each meter.  They are selected explicitly by position: `.loc[meter, :24]`
# is an integer slice on string column labels, which current pandas rejects
# with a TypeError instead of treating it positionally.
hourly_columns = raw_submission.columns[:24]
# Rows from July onwards of 2018 or later; computed once instead of per meter.
forecast_rows = validation_set[(validation_set.year >= 2018) &
                               (validation_set.month >= 7)]
for meter in list(raw_test.keys()):
    if meter != "Time":
        tempset = forecast_rows[forecast_rows.id == meter]
        raw_submission.loc[meter, hourly_columns] = tempset.value[:24].values
# for daily prediction
# Each of the ten daily columns gets the sum of that meter's predicted values
# on the corresponding day of July (year 2018 or later).
for meter in raw_test.columns:
    if meter == "Time":
        continue
    pred_day = [
        validation_set[(validation_set.id == meter) &
                       (validation_set.year >= 2018) &
                       (validation_set.month == 7) &
                       (validation_set.day == day_of_month)].value.sum()
        for day_of_month in range(1, 11)
    ]
    raw_submission.loc[meter, 'X2018_7_1_d':'X2018_7_10_d'] = pred_day
# for monthly prediction
# Each of the five monthly columns (July..November) gets the sum of that
# meter's predicted values in the corresponding month (year 2018 or later).
for meter in raw_test.columns:
    if meter == "Time":
        continue
    pred_month = [
        validation_set[(validation_set.id == meter) &
                       (validation_set.year >= 2018) &
                       (validation_set.month == month_number)].value.sum()
        for month_number in range(7, 12)
    ]
    raw_submission.loc[meter, 'X2018_7_m':'X2018_11_m'] = pred_month
# Turn "meter_id" back into a regular column and write the filled-in
# submission to disk without the row index.
raw_submission = raw_submission.reset_index()
raw_submission.to_csv("data/answer.csv", index=False)