import warnings
warnings.filterwarnings('ignore')
import random
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('ggplot')
import seaborn as sns
from IPython.display import display, HTML
display(HTML("<style>.container { width:95% !important; }</style>"))
# Import Linear Regression machine learning library
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
Cement (component 1) -- quantitative -- kg in a m3 mixture -- Input Variable
Blast Furnace Slag (component 2) -- quantitative -- kg in a m3 mixture -- Input Variable
Fly Ash (component 3) -- quantitative -- kg in a m3 mixture -- Input Variable
Water (component 4) -- quantitative -- kg in a m3 mixture -- Input Variable
Superplasticizer (component 5) -- quantitative -- kg in a m3 mixture -- Input Variable
Coarse Aggregate (component 6) -- quantitative -- kg in a m3 mixture -- Input Variable
Fine Aggregate (component 7) -- quantitative -- kg in a m3 mixture -- Input Variable
Age -- quantitative -- Day (1~365) -- Input Variable
Concrete compressive strength -- quantitative -- MPa -- Output Variable
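# For reference, the short column names used throughout this notebook presumably
# map to the component descriptions above; this mapping is an assumption based
# on the common CSV export of the UCI dataset.
column_descriptions = {
    'cement': 'Cement (kg per m3 mixture)',
    'slag': 'Blast Furnace Slag (kg per m3 mixture)',
    'ash': 'Fly Ash (kg per m3 mixture)',
    'water': 'Water (kg per m3 mixture)',
    'superplastic': 'Superplasticizer (kg per m3 mixture)',
    'coarseagg': 'Coarse Aggregate (kg per m3 mixture)',
    'fineagg': 'Fine Aggregate (kg per m3 mixture)',
    'age': 'Age (days, 1~365)',
    'strength': 'Concrete compressive strength (MPa)',
}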
# importing the data
df = pd.read_csv("concrete.csv")
df.head()  # show the first five rows of the dataset
# five-point summary of each column
df.describe().transpose()
# checking the datatypes and non-null counts
df.info()
# number of rows and columns
df.shape
# checking for null values
df.isnull().values.any()
# checking each column for skewness
df.skew()
# visualizing age skewness using a box plot and a histogram
sns.boxplot(x=df['age'])
sns.histplot(df['age'], kde=True)
# Age values up to 365 are allowed since age can range from 1 to 365 days (verified below)
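# A quick sanity check of the documented 1~365 range; a minimal sketch using the
# 'age' column name from the calls above.
assert df['age'].between(1, 365).all(), "age values outside the documented 1~365 range"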
sns.histplot(df['fineagg'], kde=True)
sns.histplot(df['coarseagg'], kde=True)
sns.histplot(df['superplastic'], kde=True)
sns.histplot(df['water'], kde=True)
sns.histplot(df['ash'], kde=True)
sns.histplot(df['slag'], kde=True)
sns.histplot(df['cement'], kde=True)
sns.histplot(df['strength'], kde=True)
sns.pairplot(df)
# From the pairplot, strength also increases as cement increases
# visualize each ingredient's relationship with strength
sns.regplot(x="cement", y="strength", data=df, fit_reg=False)
sns.regplot(x="slag", y="strength", data=df, fit_reg=False)
sns.regplot(x="ash", y="strength", data=df, fit_reg=False)
sns.regplot(x="water", y="strength", data=df, fit_reg=False)
sns.regplot(x="superplastic", y="strength", data=df, fit_reg=False)
sns.regplot(x="coarseagg", y="strength", data=df, fit_reg=False)
sns.regplot(x="fineagg", y="strength", data=df, fit_reg=False)
sns.regplot(x="age", y="strength", data=df, fit_reg=False)
# checking for highly correlated variables
df.corr()
# no highly correlated independent variables, so we plan to keep all variables as they are
plt.figure(figsize=(10, 8))
sns.heatmap(df.corr(),
            annot=True,
            linewidths=.5,
            center=0,
            cbar=False)
plt.show()
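# To back up the "no highly correlated variables" observation numerically, a
# variance inflation factor (VIF) check is a common follow-up. A minimal sketch,
# assuming statsmodels is installed (it is not imported elsewhere in this notebook).
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant

predictors = add_constant(df.drop('strength', axis=1))
vif = pd.Series([variance_inflation_factor(predictors.values, i)
                 for i in range(predictors.shape[1])],
                index=predictors.columns)
print(vif)  # values far above ~10 would suggest problematic multicollinearity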
# Copy all the predictor variables into the X dataframe; since 'strength' is the dependent variable, drop it
X = df.drop('strength', axis=1)
# Copy the 'strength' column alone into the y dataframe; this is the dependent variable
y = df[['strength']]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=1)
# Using PolynomialFeatures to create more independent variables
from sklearn.preprocessing import PolynomialFeatures
poly = PolynomialFeatures(degree=2, interaction_only=True)
X_poly = poly.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X_poly, y, test_size=0.30, random_state=1)
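# A quick look at what PolynomialFeatures generated: the feature count before and
# after, plus a few of the generated term names. A minimal sketch; note that
# get_feature_names_out requires scikit-learn >= 1.0 (older versions expose
# get_feature_names instead).
print(X.shape, "->", X_poly.shape)
print(poly.get_feature_names_out(X.columns)[:10])  # first few generated terms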
#Linear Regression
regression_model = LinearRegression()
regression_model.fit(X_train, y_train)
regression_model.score(X_test, y_test)
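# R2 alone hides the error magnitude; RMSE in the target's own units (MPa) is
# easier to interpret. A minimal sketch using the test split above.
from sklearn.metrics import mean_squared_error
pred = regression_model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, pred))
print("Linear regression test RMSE: %.2f MPa" % rmse)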
# Ridge
ridge = Ridge(alpha=0.3)
ridge.fit(X_train, y_train)
print("Ridge model coefficients:", ridge.coef_)
# Lasso: checking whether any independent variables can be dropped (coefficients shrunk to zero; counted below)
lasso = Lasso(alpha=0.1)
lasso.fit(X_train, y_train)
print("Lasso model coefficients:", lasso.coef_)
print("Ridge train R2:", ridge.score(X_train, y_train))
print("Ridge test R2:", ridge.score(X_test, y_test))
print("Lasso train R2:", lasso.score(X_train, y_train))
print("Lasso test R2:", lasso.score(X_test, y_test))
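# Since the point of the Lasso fit was to see whether any variables can be
# dropped, counting the coefficients shrunk exactly to zero makes that concrete.
n_zero = np.sum(lasso.coef_ == 0)
print("Lasso zeroed %d of %d polynomial features" % (n_zero, lasso.coef_.size))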
# The test scores are comparable between the simple and quadratic models, at around 60%; we have to try other algorithms
dtRegressor = DecisionTreeRegressor(random_state=0)
dtRegressor.fit(X_train, y_train)
print("Decision tree train R2:", dtRegressor.score(X_train, y_train))
print("Decision tree test R2:", dtRegressor.score(X_test, y_test))
# The decision tree regressor gives the required scores even before cross-validation
randomForestRegressor = RandomForestRegressor(max_depth=2, random_state=0)
randomForestRegressor.fit(X_train, y_train)
print("Random forest train R2:", randomForestRegressor.score(X_train, y_train))
print("Random forest test R2:", randomForestRegressor.score(X_test, y_test))
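# max_depth=2 restricts the forest severely and likely underfits here; as a
# comparison point, a sketch with the depth limit removed (defaults otherwise).
rfDeep = RandomForestRegressor(random_state=0)
rfDeep.fit(X_train, y_train.values.ravel())
print("Unrestricted forest train R2:", rfDeep.score(X_train, y_train))
print("Unrestricted forest test R2:", rfDeep.score(X_test, y_test))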
from sklearn.ensemble import AdaBoostRegressor
abRegressor = AdaBoostRegressor(random_state=0, n_estimators=100)
abRegressor.fit(X_train, y_train)
print("AdaBoost train R2:", abRegressor.score(X_train, y_train))
print("AdaBoost test R2:", abRegressor.score(X_test, y_test))
from sklearn.ensemble import GradientBoostingRegressor
gradientRegressor = GradientBoostingRegressor(random_state=0)
gradientRegressor.fit(X_train, y_train)
print("Gradient boosting train R2:", gradientRegressor.score(X_train, y_train))
print("Gradient boosting test R2:", gradientRegressor.score(X_test, y_test))
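# Gradient boosting fits trees sequentially, so staged_predict lets us watch the
# test R2 as estimators are added; a minimal sketch over the model fitted above.
from sklearn.metrics import r2_score
staged_r2 = [r2_score(y_test, pred) for pred in gradientRegressor.staged_predict(X_test)]
plt.plot(staged_r2)
plt.xlabel("number of trees")
plt.ylabel("test R2")
plt.show()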
dtRegressor.get_params().keys()  # list the hyperparameters available for tuning
# We will try RandomizedSearchCV
folds = 10  # number of cross-validation folds
depths = np.arange(1, 20)
num_leafs = [1, 5, 10, 20]
parameters = {'max_depth': depths,
              'min_samples_leaf': num_leafs}
dtRegressorRSCV = RandomizedSearchCV(dtRegressor, parameters, cv=folds)
dtRegressorRSCV.fit(X, y)
print(dtRegressorRSCV.best_params_)
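# The search object also records the cross-validated R2 of its best parameter
# setting, which is worth comparing with the KFold result computed next.
print("Best CV R2 from the search:", dtRegressorRSCV.best_score_)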
num_folds = 3
seed = 7
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)  # shuffle must be enabled when random_state is set
results = cross_val_score(dtRegressorRSCV, X, y, cv=kfold, scoring='r2')
print(results)
print("Mean R2: %.3f%% (%.3f%%)" % (results.mean()*100.0, results.std()*100.0))
# We will try GridSearchCV now
folds = 10  # number of cross-validation folds
depths = np.arange(1, 20)
num_leafs = [1, 5, 10, 20]
parameters = {'min_samples_leaf': num_leafs,
              'max_depth': depths}
dtRegressorGSCV = GridSearchCV(dtRegressor, param_grid=parameters, scoring='r2', cv=folds)
dtRegressorGSCV.fit(X, y)
print(dtRegressorGSCV.best_params_)
num_folds = 3
seed = 7
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
results = cross_val_score(dtRegressorGSCV, X, y, cv=kfold, scoring='r2')
print(results)
print("Mean R2: %.3f%% (%.3f%%)" % (results.mean()*100.0, results.std()*100.0))
gradientRegressor.get_params().keys()  # list the hyperparameters available for tuning
# We will try RandomizedSearchCV
folds = 10  # number of cross-validation folds
depths = np.arange(1, 20)
num_leafs = [20, 30]
parameters = {'max_depth': depths,
              'min_samples_leaf': num_leafs}
gradientRegressorRSCV = RandomizedSearchCV(gradientRegressor, parameters, cv=folds)
gradientRegressorRSCV.fit(X, y)
print(gradientRegressorRSCV.best_params_)
num_folds = 3
seed = 7
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
results = cross_val_score(gradientRegressorRSCV, X, y, cv=kfold, scoring='r2')
print(results)
print("Mean R2: %.3f%% (%.3f%%)" % (results.mean()*100.0, results.std()*100.0))
# We will try GridSearchCV
folds = 10  # number of cross-validation folds
depths = np.arange(1, 20)
num_leafs = [20, 30]
parameters = {'max_depth': depths,
              'min_samples_leaf': num_leafs}
gradientRegressorGSCV = GridSearchCV(gradientRegressor, param_grid=parameters, scoring='r2', cv=folds)
gradientRegressorGSCV.fit(X, y)
print(gradientRegressorGSCV.best_params_)
num_folds = 3
seed = 7
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
results = cross_val_score(gradientRegressorGSCV, X, y, cv=kfold, scoring='r2')
print(results)
print("Mean R2: %.3f%% (%.3f%%)" % (results.mean()*100.0, results.std()*100.0))