import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns


# Load the real-time forecast workbook into a DataFrame.
arima_data = pd.read_excel("../data/Real_time_forecast_dataset_04_04_20.xlsx")

# The columns hold different numbers of non-missing observations.  Shift each
# column down by its number of missing values so the series line up at the
# bottom of the frame (assumes the gaps are trailing -- TODO confirm against
# the workbook).
n_rows = len(arima_data)
for column, n_valid in arima_data.count().items():
    arima_data[column] = arima_data[column].shift(periods=n_rows - n_valid)

# Index the rows by consecutive daily dates ending on 4/4/2020.
arima_data['date'] = pd.date_range(end='4/4/2020', periods=n_rows, freq='D')
arima_data = arima_data.set_index('date')

# Text properties applied to the figure title.
title_style = {
    'fontfamily': 'Tahoma',
    'fontsize': 'x-large',
    'fontstyle': 'italic',
    'fontweight': 'extra bold',
    'fontvariant': 'small-caps',
}

# Plot every column of the forecast table as a stacked line chart.
forecast_ax = arima_data.plot(kind='line', stacked=True, figsize=(15, 6))
# The trailing semicolon keeps the notebook from echoing the returned object.
forecast_ax.set_title('Forecast of COVID-19 cases', **title_style);


# Load the observed case counts.
actual = pd.read_csv("../data/Realtime_cases_plot.csv")

# Prepend an all-zero row: write it under label -1, move every label up by
# one, then sort so the new row comes first.
actual.loc[-1] = ['2020-01-20T00:00:00Z', 0, 0, 0, 0, 0]
actual.index = actual.index + 1
actual = actual.sort_index()

# Index the rows by consecutive daily dates ending on 4/4/2020, the same
# calendar used for the forecast table.
actual['date'] = pd.date_range(end='4/4/2020', periods=len(actual), freq='D')
actual = actual.set_index('date')

def _observed_vs_forecast(observed_name, forecast_name):
    """Left-join one observed column of `actual` with one column of `arima_data` on 'date'."""
    return pd.merge(actual[observed_name], arima_data[forecast_name],
                    how='left', on='date')


# One two-column frame per country; the two tables label some countries
# differently, hence the explicit name pairs.
S_Korea = _observed_vs_forecast("South Korea", "S. Korea")
France = _observed_vs_forecast("France", "France")
Canada = _observed_vs_forecast("Canada", "Canada")
India = _observed_vs_forecast("India", "India")
UK = _observed_vs_forecast("United Kingdom", "UK")

%run 'Graph_Fig.py'

fig


#Set colors to default
mpl.rcParams.update(mpl.rcParamsDefault)

%run 'Graph_Flag.py'


flags


# Descriptive statistics with one row per variable: keep mean/min/max and
# replace the standard deviation by the variance (std squared).
summary = (
    covid_data.describe()
    .transpose()
    .assign(variance=lambda stats: np.square(stats['std']))
    .drop(columns=['count', 'std', '25%', '50%', '75%'])
)
# Displayed rounded to two decimals; `summary` itself keeps full precision.
summary.round(2)


from sklearn.tree import DecisionTreeRegressor
from sklearn import tree


# Predictors: every column except the target (CFR) and, as in the paper,
# Total deaths and Climate zones.
X = covid_data.drop(columns=['CFR', 'Total deaths', 'Climate zones'])
y = covid_data['CFR']

from sklearn.tree import DecisionTreeRegressor
from sklearn import tree

# We set the parameters equal to the control parameters of the corresponding
# R function 'rpart' used in the paper.
# The split criterion is deliberately left at its default: it is the mean
# squared error in every scikit-learn release (spelled "mse" before 1.0 and
# "squared_error" since), whereas passing "mse" explicitly fails from
# version 1.2 on.  This is the counterpart of rpart's method='anova'.
# NOTE(review): no random_state is set, so ties between equally good splits
# may be broken differently from run to run.
model = tree.DecisionTreeRegressor(min_samples_split=5,  # $minsplit = 5
                                   max_depth=30,  # $maxdepth
                                   min_samples_leaf=2)  # $minbucket
model.fit(X, y);


# Title-cased predictor names used as node labels (reused by later cells).
features = X.columns.str.title()

# Draw the fitted tree on an oversized canvas (100 x 70 inches).
plt.figure(figsize=(100, 70))
tree.plot_tree(
    model,
    feature_names=features,
    filled=True,
    rounded=True,
    node_ids=False,
    fontsize=40,
)
plt.show()


from dtreeviz.trees import dtreeviz

# Colour overrides for the individual elements of the dtreeviz rendering.
tree_palette = {
    'title': 'black',
    'text': '#14213d',
    'arrow': '#455e89',
    'scatter_marker': '#a01a58',
    'tick_label': 'grey',
    'split_line': '#CED4DA',
}

# Alternative rendering of the same fitted tree, laid out left-to-right.
viz = dtreeviz(
    model, X, y,
    target_name="CFR",
    feature_names=features,
    title='Regression Tree',
    title_fontsize=20,
    fontname="Arial",
    orientation="LR",
    scale=0.8,
    show_node_labels=False,
    colors=tree_palette,
)
viz


# Horizontal bar chart of the importances reported by the fitted tree.
# The original used nsmallest(10), which only works while there are at most
# 10 predictors; with more it would show the 10 *least* important ones.
(pd.Series(model.feature_importances_,
           index=X.columns.str.title())
   .nlargest(10)  # keep (up to) the 10 most important variables
   .iloc[::-1]    # barh draws the first row at the bottom: most important ends on top
   .plot(kind='barh',
         title='Variable Importance',
         figsize=[12, 6],
         table=False,
         fontsize=13,
         color='#2e6f95',
         align='edge', width=0.8
         ));


# Country-to-continent lookup table downloaded from GitHub.
world = pd.read_csv("https://raw.githubusercontent.com/dbouquin/IS_608/master/NanosatDB_munging/Countries-Continents.csv")

# Replace these labels in the lookup table before joining on 'Country'
# (presumably to match the spellings used in covid_data -- verify).
country_aliases = {
    'US': 'USA',
    'United Kingdom': 'UK',
    'CZ': 'Czechia',
    'Russian Federation': 'Russia',
    'Korea, South': 'S. Korea',
}
world = world.replace(country_aliases)

# Left join on 'Country' so every row of covid_data is kept.
final = covid_data.merge(world, how='left', on='Country')

# Group the merged table by continent and pull out one frame per continent.
final1 = final.groupby('Continent')

# Observation counts noted by the original author: Africa 3, Asia 13,
# Europe 23, North America 5, South America 5, Oceania 1.
continent_names = ('Africa', 'Asia', 'Europe',
                   'North America', 'South America', 'Oceania')
Africa, Asia, Europe, N_America, S_America, Oceania = map(final1.get_group, continent_names)


# Europe-only predictors: drop the target (CFR), the identifier columns and
# the two variables that are also excluded from the full-sample model.
X3 = Europe.drop(columns=['CFR', 'Continent', 'Country', 'Total deaths', 'Climate zones'])
y3 = Europe['CFR']

#min split 10%, so in this case =2
# The split criterion is left at its default: it is the mean squared error in
# every scikit-learn release (spelled "mse" before 1.0 and "squared_error"
# since), whereas passing "mse" explicitly fails from version 1.2 on.
model3 = tree.DecisionTreeRegressor(min_samples_split=2, max_depth=15, min_samples_leaf=2)
model3.fit(X3, y3)

# Draw the fitted Europe tree on an oversized canvas (80 x 50 inches).
plt.figure(figsize=(80, 50))
features = X3.columns  # rebinds `features`, which later cells reuse
tree.plot_tree(model3, feature_names=features, filled=True, fontsize=50)
plt.show()


# Colour overrides for the individual elements of the dtreeviz rendering.
europe_palette = {
    'title': 'black',
    'text': '#14213d',
    'arrow': '#455e89',
    'scatter_marker': '#a01a58',
    'tick_label': 'grey',
    'split_line': '#CED4DA',
}

# Alternative rendering of the Europe-only tree.
viz2 = dtreeviz(
    model3, X3, y3,
    target_name="CFR",
    feature_names=features,
    title='Europe Regression Tree',
    title_fontsize=15,
    fontname="Arial",
    scale=1.3,
    show_node_labels=False,
    colors=europe_palette,
)
viz2


# Horizontal bar chart of the importances reported by the Europe-only tree.
# The original used nsmallest(10), which only works while there are at most
# 10 predictors; with more it would show the 10 *least* important ones.
(pd.Series(model3.feature_importances_,
           index=X3.columns.str.title())
   .nlargest(10)  # keep (up to) the 10 most important variables
   .iloc[::-1]    # barh draws the first row at the bottom: most important ends on top
   .plot(kind='barh',
         title='Variable Importance',
         figsize=[12, 6],
         table=False,
         fontsize=13,
         color='#4E8d95',
         align='edge', width=0.8));


from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score

# Reference values for the paper's model, entered by hand.  MAE is left as
# NaN (presumably not reported in the paper -- verify).
RMSE = 0.013
MAE = float('nan')
r2 = 0.896
Adj_r2 = 0.769


def _in_sample_metrics(y_true, y_hat, n_features):
    """Return (RMSE, MAE, R^2, adjusted R^2) of `y_hat` against `y_true`.

    `n_features` is the number of predictors used in the adjusted R^2
    correction 1 - (1 - R^2) * (n - 1) / (n - p - 1).
    """
    # RMSE is taken as the square root of the MSE instead of passing
    # `squared=False`: that keyword was deprecated in scikit-learn 1.4 and
    # later removed, while this form works in every release.
    rmse = np.sqrt(mean_squared_error(y_true, y_hat))
    mae = mean_absolute_error(y_true, y_hat)
    r_squared = r2_score(y_true, y_hat)
    n_obs = len(y_true)
    adj_r_squared = 1 - (1 - r_squared) * (n_obs - 1) / (n_obs - n_features - 1)
    return rmse, mae, r_squared, adj_r_squared


# Full-sample tree, evaluated on the data it was fitted on.
y_pred = model.predict(X)
RMSE2, MAE2, r2_2, Adj_r2_2 = _in_sample_metrics(y, y_pred, X.shape[1])
# Pre-rounding kept exactly as in the original cell (4 decimals for the
# errors, 2 for the R^2 values), so these rows show at most that precision.
RMSE2 = np.round(RMSE2, 4)
MAE2 = np.round(MAE2, 4)
r2_2 = np.round(r2_2, 2)
Adj_r2_2 = np.round(Adj_r2_2, 2)

# Europe-only tree, evaluated on the data it was fitted on (not pre-rounded).
y_pred3 = model3.predict(X3)
RMSE3, MAE3, r2_3, Adj_r2_3 = _in_sample_metrics(y3, y_pred3, X3.shape[1])

# Side-by-side comparison table; every column is shown with three decimals.
three_decimals = "{:,.3f}".format
(pd.DataFrame({'RMSE': [RMSE, RMSE2, RMSE3], 'MAE': [MAE, MAE2, MAE3],
               'R^2': [r2, r2_2, r2_3], 'Adjusted R^2': [Adj_r2, Adj_r2_2, Adj_r2_3]},
              index=['Paper Model Metrics', 'Our Model Metrics', 'EU Model Metrics'])
 .style.set_caption("Models Metrics")
 .set_table_styles([{'selector': 'caption', 'props': 'caption-side: top; font-size:1.8em;'}])
 .format(formatter={column: three_decimals
                    for column in ('RMSE', 'MAE', 'R^2', 'Adjusted R^2')}))

	Cases	Population	Population Density	People (>65)	No. Of Days Since Shutdown	Time Of Arival (Till Today)	Doctors Per 1000 People	Hospital Beds Per 1000	Income Class	Climate Zones	Cfr	Total Deaths
Country
USA	277.965$K$	329.55$M$	34 /$km^2$	15.4%	16	75	2.57	2.90	1	1	0.026	7157
Italy	119.827$K$	60.25$M$	200 /$km^2$	23.0%	26	65	4.02	3.40	1	1	0.123	14681
Spain	117.710$K$	46.93$M$	93 /$km^2$	19.4%	20	64	3.87	3.10	1	0	0.093	10935
China	82.527$K$	1402.01$M$	145 /$km^2$	10.6%	73	84	1.81	3.80	0	0	0.040	3330
Germany	79.696$K$	83.15$M$	233 /$km^2$	21.5%	21	68	4.19	8.20	1	1	0.013	1017
France	64.338$K$	67.06$M$	123 /$km^2$	19.7%	22	71	3.24	6.40	1	1	0.101	6507
Iran	50.468$K$	83.33$M$	51 /$km^2$	5.4%	46	76	1.49	0.10	0	0	0.063	3160
UK	38.168$K$	66.43$M$	274 /$km^2$	18.5%	19	64	2.83	2.90	1	1	0.094	3605
Turkey	20.921$K$	83.15$M$	106 /$km^2$	8.2%	19	25	1.75	2.50	0	0	0.020	425
Switzerland	19.706$K$	8.58$M$	208 /$km^2$	18.4%	20	39	4.25	5.00	1	1	0.031	607
Belgium	16.770$K$	11.52$M$	376 /$km^2$	18.6%	18	60	3.01	6.50	1	1	0.068	1143
Netherlands	15.723$K$	17.45$M$	420 /$km^2$	18.8%	17	37	3.48	4.70	1	1	0.095	1487
Canada	12.519$K$	31.98$M$	4 /$km^2$	17.0%	20	70	2.54	2.70	1	1	0.015	187
Austria	11.525$K$	8.90$M$	106 /$km^2$	19.2%	20	39	5.23	7.60	1	1	0.015	168
S. Korea	10.156$K$	51.78$M$	517 /$km^2$	13.9%	13	75	2.33	10.30	1	0	0.017	177
Portugal	9.886$K$	10.28$M$	112 /$km^2$	21.5%	17	62	4.43	3.40	1	0	0.025	246
Brazil	9.056$K$	211.33$M$	25 /$km^2$	8.6%	19	39	1.85	2.30	0	0	0.040	359
Israel	7.428$K$	9.18$M$	416 /$km^2$	11.7%	13	43	3.58	3.30	1	-1	0.005	39
Sweden	6.078$K$	10.37$M$	23 /$km^2$	19.9%	11	64	4.19	2.70	1	1	0.055	333
Australia	5.548$K$	25.66$M$	3 /$km^2$	15.5%	13	70	3.50	3.90	1	-1	0.005	30
Norway	5.208$K$	5.37$M$	17 /$km^2$	16.8%	20	38	4.38	3.30	1	1	0.008	44
Ireland	4.273$K$	4.92$M$	70 /$km^2$	13.9%	24	35	2.96	2.90	1	1	0.028	120
Czechia	4.190$K$	10.68$M$	135 /$km^2$	19.0%	24	33	3.68	6.80	1	1	0.013	53
Russia	4.149$K$	146.88$M$	9 /$km^2$	14.2%	22	64	3.98	9.70	0	1	0.008	34
Denmark	3.757$K$	5.81$M$	135 /$km^2$	19.7%	22	37	3.65	3.50	1	1	0.037	139
Poland	3.383$K$	38.39$M$	123 /$km^2$	16.8%	21	31	2.29	6.50	1	1	0.021	71
Ecuador	3.368$K$	17.46$M$	63 /$km^2$	7.1%	20	35	1.67	1.60	0	1	0.043	145
Malaysia	3.333$K$	32.74$M$	99 /$km^2$	6.3%	20	70	1.53	1.90	0	0	0.016	53
Romania	3.183$K$	19.40$M$	81 /$km^2$	17.9%	15	38	2.67	6.10	0	1	0.042	133
Philippines	3.018$K$	108.48$M$	362 /$km^2$	4.8%	22	65	1.11	1.00	0	0	0.045	136
Japan	2.935$K$	126.01$M$	333 /$km^2$	27.0%	34	80	2.37	13.70	1	0	0.024	69
India	2.902$K$	1360.49$M$	414 /$km^2$	6.0%	13	65	0.76	0.70	0	0	0.023	68
Luxembourg	2.612$K$	0.61$M$	237 /$km^2$	14.3%	0	35	2.92	5.40	1	1	0.012	31
Pakistan	2.291$K$	219.14$M$	273 /$km^2$	4.5%	15	38	0.98	0.60	0	0	0.014	31
Indonesia	1.986$K$	268.07$M$	141 /$km^2$	5.3%	21	33	0.20	0.90	0	0	0.091	181
Mexico	1.688$K$	126.58$M$	64 /$km^2$	6.9%	16	35	2.23	1.50	0	-1	0.036	60
Panama	1.673$K$	4.16$M$	56 /$km^2$	7.9%	14	25	1.59	2.20	1	0	0.025	41
Finland	1.615$K$	5.52$M$	16 /$km^2$	21.2%	13	66	3.20	5.50	1	1	0.012	20
Greece	1.613$K$	10.72$M$	81 /$km^2$	20.4%	21	38	6.25	4.80	1	0	0.037	59
Peru	1.595$K$	32.16$M$	25 /$km^2$	7.2%	20	29	1.12	1.50	0	0	0.038	61
Dominican Republic	1.488$K$	10.36$M$	216 /$km^2$	7.0%	20	34	1.49	1.70	0	0	0.046	68
Serbia	1.476$K$	6.90$M$	89 /$km^2$	17.4%	17	29	2.46	5.40	0	1	0.026	39
Colombia	1.267$K$	46.22$M$	40 /$km^2$	7.6%	20	28	1.82	1.50	0	0	0.020	25
Argentina	1.265$K$	44.94$M$	16 /$km^2$	11.2%	20	32	3.21	4.70	0	-1	0.029	37
Ukraine	0.987$K$	41.90$M$	69 /$km^2$	16.5%	23	32	3.00	9.00	0	1	0.023	23
Algeria	0.986$K$	43.00$M$	18 /$km^2$	6.2%	17	39	1.21	1.70	0	-1	0.084	83
Egypt	0.779$K$	100.19$M$	100 /$km^2$	5.2%	17	50	0.81	0.50	0	-1	0.067	52
Iraq	0.772$K$	39.31$M$	90 /$km^2$	3.2%	19	42	0.85	1.30	0	-1	0.070	54
Morocco	0.708$K$	35.86$M$	80 /$km^2$	6.8%	22	33	0.62	0.90	0	-1	0.062	44
San Marino	0.251$K$	0.03$M$	568 /$km^2$	16.0%	15	37	6.36	3.80	1	1	0.127	32

	mean	min	max	variance
cases in thousands	20.90	0.25	277.96	2187.92
population (in millions)	110.62	0.03	1402.01	73658.12
population density/km2	149.78	3.00	568.00	20371.56
% People (>65)	13.58	3.20	27.00	38.59
no. of days since shutdown	20.20	0.00	73.00	95.96
time of arival (till today)	48.72	25.00	84.00	309.23
Doctors per 1000 people	2.71	0.20	6.36	1.98
Hospital beds per 1000	3.93	0.10	13.70	8.25
CFR	0.04	0.01	0.13	0.00
Total deaths	1151.98	20.00	14681.00	8205536.59

Table of Contents

Real-time forecasts and risk assessment of novel coronavirus (COVID-19) cases: A data-driven analysis

Forecast of COVID-19 cases

Risk assessment of COVID-19 cases

Data

Summary Statistics

Method: Regression Tree

Paper Model

Considerations

Different Representation of The Regression Tree

Variable Importance Plot

RT Europe

Different Representation of The Regression Tree

Variable Importance Plot

Metrics

Results and models comparison

Limitations of our findings

Conclusions

	RMSE	MAE	R^2	Adjusted R^2
Paper Model Metrics	0.013	nan	0.896	0.769
Our Model Metrics	0.010	0.007	0.890	0.870
EU Model Metrics	0.015	0.010	0.832	0.715