V tomto notebooku

  • Sestavení, evaluace a vyhodnocení modelu.
  • Logistická regrese.
In [1]:
import pandas as pd
pd.set_option('display.height', 1000)
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

import numpy as np

import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

%matplotlib notebook
matplotlib.rcParams['figure.figsize'] = (9.5, 5.5)
height has been deprecated.

In [2]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn import model_selection
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix
from sklearn import preprocessing

Načtení dat

In [3]:
match = pd.read_csv('../match_final_without_players.csv')
match.shape
Out[3]:
(3040, 328)
In [4]:
match = match.drop(['at_wins_against_ht_last_1m',
                    'at_draw_against_ht_last_1m',
                    'at_loss_against_ht_last_1m',
                    'at_wins_against_ht_last_2m',
                    'at_draw_against_ht_last_2m',
                    'at_loss_against_ht_last_2m',
                    'ht_wins_against_at_last_2m',
                    'ht_draw_against_at_last_2m',
                    'ht_loss_against_at_last_2m',
                    'goals_last_2m',
                    'goals_avg_last_2m',
                    'Unnamed: 0'
                   ], axis=1)
In [5]:
match = match.dropna().reset_index()
match.shape
Out[5]:
(2244, 317)

Normalizace

In [6]:
match_norm   = match.copy()
match_norm   = match_norm.reset_index()
match_target = match_norm[['match_result', 'home_win', 'home_win_draw', 'goals', 'over_15', 'over_25', 'over_35']]
match_norm   = match_norm.drop(['match_result', 'home_win', 'home_win_draw', 'goals', 'over_15', 'over_25', 'over_35'], 
                                axis=1)
columns      = match_norm.columns

x            = match_norm.values
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled     = min_max_scaler.fit_transform(x)
match_norm   = pd.DataFrame(x_scaled, columns=columns).drop(['index', 'level_0'], axis=1)
In [7]:
match_norm.shape
Out[7]:
(2244, 309)

Rozdělení dat

In [8]:
X = match_norm.copy()
y = match_target['over_25']
perc = np.percentile(X.index, 80)
X_train = X[X.index < perc]
X_test = X[X.index >= perc]
y_train = y[y.index < perc]
y_test = y[y.index >= perc]

Výběr proměnných

In [9]:
logreg = LogisticRegression(class_weight="balanced")

# RFE (Recursive Feature Elimination)
rfe = RFE(logreg, 47)
rfe = rfe.fit(X_train, y_train)

feature_names = np.array(X_train.columns) # transformed list to array
print(feature_names[rfe.support_])

cols    = feature_names[rfe.support_]
X_train = X_train[cols]
X_test  = X_test[cols]
['BWH' 'at_goals_shooted_last_1m' 'ht_goals_recieve_last_2m'
 'ht_goals_shooted_last_5m' 'at_goals_recieve_last_5m'
 'ht_goals_total_last_1m' 'at_goals_total_last_3m'
 'at_goals_total_avg_last_3m' 'at_goals_total_avg_last_4m'
 'ht_goals_shooted_last_3m_h' 'BbAv>2.5' 'ht_shoots_shooted_last_1m'
 'ht_shoots_shooted_last_2m' 'ht_shoots_recieve_last_4m'
 'ht_shoots_recieve_avg_last_4m' 'at_shoots_recieve_last_2m'
 'at_shoots_shooted_avg_last_5m' 'ht_shoots_shooted_target_last_1m'
 'ht_shoots_shooted_target_last_2m' 'ht_shoots_shooted_target_last_4m'
 'ht_shoots_shooted_target_last_5m' 'ht_shoots_recieve_target_last_3m'
 'ht_shoots_shooted_target_avg_last_2m'
 'ht_shoots_shooted_target_avg_last_4m'
 'ht_shoots_shooted_target_avg_last_5m'
 'ht_shoots_recieve_target_avg_last_3m' 'at_shoots_recieve_target_last_1m'
 'at_shoots_shooted_target_avg_last_2m' 'ht_fouls_victims_last_1m'
 'ht_fouls_victims_avg_last_3m' 'at_fouls_commited_last_1m'
 'at_fouls_commited_last_4m' 'at_fouls_commited_last_5m'
 'at_fouls_victims_last_2m' 'at_fouls_commited_avg_last_4m'
 'at_fouls_commited_avg_last_5m' 'ht_corners_last_1m' 'ht_corners_last_4m'
 'at_corners_avg_last_2m' 'ht_yc_last_1m' 'ht_yc_last_2m'
 'ht_yc_against_last_2m' 'ht_yc_against_last_3m' 'ht_yc_avg_last_3m'
 'ht_yc_avg_last_4m' 'ht_yc_avg_last_5m' 'ht_yc_against_avg_last_5m']

Logistická regrese

cílová proměnná: over_25 (1 pokud je součet branek vyšší než 2.5, 0 jinak)

In [10]:
# Logistic regression model fitting
logreg.fit(X_train, y_train)
Out[10]:
LogisticRegression(C=1.0, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)
In [11]:
# predicting the test set results and calculating the accuracy
y_pred = logreg.predict(X_test)
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(logreg.score(X_test, y_test)))
Accuracy of logistic regression classifier on test set: 0.55
In [12]:
# cross validation
kfold = model_selection.KFold(n_splits=10, random_state=7)
modelCV = logreg
results = model_selection.cross_val_score(modelCV, X_train, y_train, cv=kfold, scoring='accuracy')
print("10-fold cross validation average accuracy: %.3f" % (results.mean()))
10-fold cross validation average accuracy: 0.580
In [13]:
sns.set(font_scale=1.6)
mat = confusion_matrix(y_pred, y_test)
sns.heatmap(mat.T, square=True, annot=True, fmt='d', cbar=False)
plt.xlabel('Predikovaná kategorie')
plt.ylabel('Skutečná kategorie');
In [14]:
# vypocet zisku/ztraty
result =  match[match.index >= perc][['BbAv<2.5', 'BbAv>2.5']].copy()
result['true'] = y_test
result['pred'] = y_pred
# funkce, ktera vraci sazkovy kurz dle vyhry/prohry
def f(row):
    if (row['true'] == row['pred']) & (row['pred']==0):
        val = row['BbAv<2.5']-1
    elif (row['true'] == row['pred']) & (row['pred']==1):
        val = row['BbAv>2.5']-1
    else:
        val = -1
    return val

result['cash'] = result.apply(f, axis=1)

# soucet ocistenych sazkovych kurzu
result['cash'].sum()
Out[14]:
-12.500000000000009
In [15]:
# ROI
result['cash'].sum()/result.shape[0]
Out[15]:
-0.027839643652561266
In [16]:
# cash pouze na predikcích <2.5
print(result[result.pred==0]['cash'].sum())
# cash pouze na predikcích >2.5
print(result[result.pred==1]['cash'].sum())
-13.239999999999998
0.7400000000000009
In [17]:
result.sort_index(inplace=True)
#result = result.reset_index()
cash_time = np.cumsum(result['cash'])
plt.plot(cash_time)
plt.xlabel('Čas')
plt.ylabel('Zisk')
Out[17]:
Text(0,0.5,'Zisk')
In [18]:
result.head()
Out[18]:
BbAv<2.5 BbAv>2.5 true pred cash
1795 1.76 2.05 1 1 1.05
1796 1.64 2.24 0 0 0.64
1797 1.63 2.25 0 0 0.63
1798 1.56 2.40 1 0 -1.00
1799 1.89 1.91 0 0 0.89

Uložení výsledku do csv

In [19]:
result[['BbAv<2.5', 'BbAv>2.5', 'true', 'pred']].to_csv('pred_logreg.csv')

Závěr

  • Byl sestaven model logistické regrese.
  • Při použití modelu byl predikován záporný zisk.
  • Přesnost predikce je nízká.
  • Model není pro predikci výsledku vhodný.