Basic & Econometrics - API examples

3 minute read

# importing necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import metrics
 
# importing dataset from sklearn
from sklearn.datasets import load_boston
boston_data = load_boston()
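Note: load_boston was deprecated in scikit-learn 1.0 and removed in 1.2, so the import above only works on older versions. A minimal sketch of an equivalent load on newer versions, following the alternative suggested in scikit-learn's deprecation notice (pulling the raw data from the original CMU StatLib source), would look like this:

# alternative for scikit-learn >= 1.2, where load_boston no longer exists
# rebuilds the same 506 x 13 feature matrix and target vector from the raw file
data_url = "http://lib.stat.cmu.edu/datasets/boston"
raw_df = pd.read_csv(data_url, sep="\s+", skiprows=22, header=None)
boston_features = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
boston_target = raw_df.values[1::2, 2]

The rest of this post keeps using the boston_data object returned by load_boston; with the snippet above you would use boston_features and boston_target in its place.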

# initializing dataset
data_ = pd.DataFrame(boston_data.data)

# Top five rows of the dataset
data_.head()
0 1 2 3 4 5 6 7 8 9 10 11 12
0 0.00632 18.0 2.31 0.0 0.538 6.575 65.2 4.0900 1.0 296.0 15.3 396.90 4.98
1 0.02731 0.0 7.07 0.0 0.469 6.421 78.9 4.9671 2.0 242.0 17.8 396.90 9.14
2 0.02729 0.0 7.07 0.0 0.469 7.185 61.1 4.9671 2.0 242.0 17.8 392.83 4.03
3 0.03237 0.0 2.18 0.0 0.458 6.998 45.8 6.0622 3.0 222.0 18.7 394.63 2.94
4 0.06905 0.0 2.18 0.0 0.458 7.147 54.2 6.0622 3.0 222.0 18.7 396.90 5.33
# Adding feature names to the dataframe
data_.columns = boston_data.feature_names
data_.head()
CRIM ZN INDUS CHAS NOX RM AGE DIS RAD TAX PTRATIO B LSTAT
0 0.00632 18.0 2.31 0.0 0.538 6.575 65.2 4.0900 1.0 296.0 15.3 396.90 4.98
1 0.02731 0.0 7.07 0.0 0.469 6.421 78.9 4.9671 2.0 242.0 17.8 396.90 9.14
2 0.02729 0.0 7.07 0.0 0.469 7.185 61.1 4.9671 2.0 242.0 17.8 392.83 4.03
3 0.03237 0.0 2.18 0.0 0.458 6.998 45.8 6.0622 3.0 222.0 18.7 394.63 2.94
4 0.06905 0.0 2.18 0.0 0.458 7.147 54.2 6.0622 3.0 222.0 18.7 396.90 5.33
# Adding the target variable (PRICE) of the Boston Housing data
data_['PRICE'] = boston_data.target
# checking null values
data_.isnull().sum()
CRIM       0
ZN         0
INDUS      0
CHAS       0
NOX        0
RM         0
AGE        0
DIS        0
RAD        0
TAX        0
PTRATIO    0
B          0
LSTAT      0
PRICE      0
dtype: int64
# checking the data types (none of the columns are categorical)
data_.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 14 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   CRIM     506 non-null    float64
 1   ZN       506 non-null    float64
 2   INDUS    506 non-null    float64
 3   CHAS     506 non-null    float64
 4   NOX      506 non-null    float64
 5   RM       506 non-null    float64
 6   AGE      506 non-null    float64
 7   DIS      506 non-null    float64
 8   RAD      506 non-null    float64
 9   TAX      506 non-null    float64
 10  PTRATIO  506 non-null    float64
 11  B        506 non-null    float64
 12  LSTAT    506 non-null    float64
 13  PRICE    506 non-null    float64
dtypes: float64(14)
memory usage: 55.5 KB
# creating the feature matrix and target variable
X = data_.drop(['PRICE'], axis=1)
y = data_['PRICE']
 
# splitting into training and testing set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2, random_state=1)
print("X training shape : ", X_train.shape )
print("X test shape : ", X_test.shape )
print("y training shape :" , y_train.shape )
print("y test shape :", y_test.shape )
 
# creating the model (a random forest regressor)
from sklearn.ensemble import RandomForestRegressor
regressor = RandomForestRegressor()
regressor.fit(X_train, y_train)
X training shape :  (404, 13)
X test shape :  (102, 13)
y training shape : (404,)
y test shape : (102,)

RandomForestRegressor()
# Model evaluation for training data
prediction = regressor.predict(X_train)
print("r^2 : ", metrics.r2_score(y_train, prediction))
print("Mean Absolute Error: ", metrics.mean_absolute_error(y_train, prediction))
print("Mean Squared Error: ", metrics.mean_squared_error(y_train, prediction))
print("Root Mean Squared Error : ", np.sqrt(metrics.mean_squared_error(y_train, prediction)))


# Model evaluation for testing data
prediction_test = regressor.predict(X_test)
print("r^2 : ", metrics.r2_score(y_test, prediction_test))
print("Mean Absolute Error : ", metrics.mean_absolute_error(y_test, prediction_test))
print("Mean Squared Error : ", metrics.mean_squared_error(y_test, prediction_test))
print("Root Mean Squared Error : ", np.sqrt(metrics.mean_squared_error(y_test, prediction_test)))

r^2 :  0.9830647954820073
Mean Absolute Error:  0.8002103960396031
Mean Squared Error:  1.3680492747524753
Root Mean Squared Error :  1.1696363856996221
r^2 :  0.9132857647425381
Mean Absolute Error :  2.2969901960784305
Mean Squared Error :  8.569741499999996
Root Mean Squared Error :  2.9274120823689986
# saving the model
import pickle
with open('../api-example/model/model.pkl','wb') as file:
    pickle.dump(regressor, file)

# saving the columns
model_columns = list(X.columns)
with open('../api-example/model/model_columns.pkl','wb') as file:
    pickle.dump(model_columns, file)
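
As a quick sanity check before wiring up the API, the pickled artifacts can be loaded back and used for a prediction. This is just an illustrative round trip: the loaded_model and loaded_columns names are mine, and the paths are the ones used in the saving code above.

# reloading the pickled model and column list, then predicting on one row
with open('../api-example/model/model.pkl', 'rb') as file:
    loaded_model = pickle.load(file)
with open('../api-example/model/model_columns.pkl', 'rb') as file:
    loaded_columns = pickle.load(file)

sample = X_test[loaded_columns].head(1)  # keep the column order the model was trained with
print(loaded_model.predict(sample))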
# building an example request payload: the first test row as a JSON record
import json
result = X_test.head(1).to_json(orient="records")

parsed = json.loads(result)

json.dumps(parsed)  

'[{"CRIM": 0.04932, "ZN": 33.0, "INDUS": 2.18, "CHAS": 0.0, "NOX": 0.472, "RM": 6.849, "AGE": 70.3, "DIS": 3.1827, "RAD": 7.0, "TAX": 222.0, "PTRATIO": 18.4, "B": 396.9, "LSTAT": 7.53}]'
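
With the model, the column list, and a sample payload in hand, serving predictions is a matter of wrapping everything in a small web app. The sketch below is one way to do it with Flask; the app.py name, the /predict route, and the port are my assumptions rather than something fixed by the code above, and the paths assume the script lives inside the api-example folder next to model/.

# app.py - illustrative Flask wrapper around the pickled model (not part of the original code)
import pickle
import pandas as pd
from flask import Flask, request, jsonify

app = Flask(__name__)

# load the artifacts saved earlier (paths assume app.py sits inside api-example/)
with open('model/model.pkl', 'rb') as file:
    model = pickle.load(file)
with open('model/model_columns.pkl', 'rb') as file:
    model_columns = pickle.load(file)

@app.route('/predict', methods=['POST'])
def predict():
    records = request.get_json()  # expects a list of records like the JSON payload above
    query = pd.DataFrame(records).reindex(columns=model_columns, fill_value=0)
    prediction = model.predict(query)
    return jsonify({'prediction': prediction.tolist()})

if __name__ == '__main__':
    app.run(debug=True)

Once the app is running (by default Flask listens on localhost:5000), the parsed record from above can be posted to it, for example with requests:

# example client call, assuming the Flask app above is running locally
import requests
response = requests.post("http://localhost:5000/predict", json=parsed)
print(response.json())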