# For this analysis, we will need the following libraries and modules
import numpy as np
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
import seaborn as sns
sns.set()  # in newer seaborn versions, sns.set_theme() is the preferred equivalent
# Load the data from a .csv in the same folder
raw_data = pd.read_csv('1.04. Real-life example.csv')
# Explore the top 5 rows of the df
raw_data.head()
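Before going further, it can also help to check the dimensions and the type of each column; a quick sketch (our addition, not part of the original walkthrough):
# Check the number of observations/features and the dtype of each column
print(raw_data.shape)
raw_data.dtypes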
We would like to predict the price of a used car based on its specifications. Potential regressors in this data are:
1) Brand: A BMW is generally more expensive than a Toyota.
2) Mileage: The more a car has been driven, the cheaper it should be.
3) Engine Volume: Sports cars have larger engines than economy cars.
4) Year of Production: The older the car, the cheaper it is, with the exception of vintage vehicles.
The rest are categorical variables, which we will deal with on a case-by-case basis.
# By default, only descriptives for the numerical variables would be shown
# To include the categorical ones, we should specify this with an argument
raw_data.describe(include='all')
# We could drop a column from the dataset like this (e.g. 'Registration'),
# but for now we keep all features
#data = raw_data.drop(['Registration'],axis=1)
#data.describe(include='all')
data = raw_data
# data.isnull() # shows a df with the information whether a data point is null
# Since True = the data point is missing, while False = the data point is not missing, we can sum them
# This will give us the total number of missing values feature-wise
data.isnull().sum()
# Drop all missing values
# This is not always recommended; however, removing less than 5% of the observations is generally acceptable
data_no_mv = data.dropna(axis=0)
# Check the descriptives without the missing values
data_no_mv.describe(include='all')
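As a sanity check on the 5% rule of thumb, we can compute the share of observations we just dropped; a minimal sketch:
# Share of rows removed by dropna() relative to the original data
removed_share = 1 - len(data_no_mv) / len(data)
print(f"Share of observations removed: {removed_share:.2%}")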
Notice the min and max values with respect to the mean and the quartiles for each variable. Namely, the max values for Price, Mileage, and EngineV are very high, and the min value for Year is very low compared to the rest of the values.
# Plot the distribution of 'Price'
# Note: sns.distplot is deprecated (and removed in recent seaborn releases);
# sns.histplot(data_no_mv['Price'], kde=True) is the modern equivalent
sns.distplot(data_no_mv['Price'])
# Declare a variable that will be equal to the 98th percentile of the 'Price' variable
q = data_no_mv['Price'].quantile(0.98)
# Then we can create a new df with the condition that all prices must be below the 98th percentile of 'Price'
data_1 = data_no_mv[data_no_mv['Price']<q]
# In this way we have essentially removed the top 2% of observations with respect to 'Price'
data_1.describe(include='all')
# We can check the PDF once again to ensure that the result is still distributed in the same way overall
sns.distplot(data_1['Price'])
# We can treat the other numerical variables in a similar way
sns.distplot(data_no_mv['Mileage'])
q = data_1['Mileage'].quantile(0.99)
data_2 = data_1[data_1['Mileage']<q]
sns.distplot(data_2['Mileage'])
sns.distplot(data_no_mv['EngineV'])
# For 'EngineV' a fixed cutoff works better than a percentile:
# values above 6.5 appear to be erroneous or placeholder entries rather than real engine volumes
data_3 = data_2[data_2['EngineV']<6.5]
sns.distplot(data_3['EngineV'])
sns.distplot(data_no_mv['Year'])
# Remove the lowest 1% of 'Year': vintage cars are not representative of the market we are modeling
q = data_3['Year'].quantile(0.01)
data_4 = data_3[data_3['Year']>q]
sns.distplot(data_4['Year'])
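Since the same trimming pattern repeats for each variable, it could also be factored into a small helper; a sketch (the name trim_quantile is ours, purely illustrative):
# Hypothetical helper: keep rows whose column value lies strictly inside the given quantiles
def trim_quantile(df, column, lower=None, upper=None):
    mask = pd.Series(True, index=df.index)
    if upper is not None:
        mask &= df[column] < df[column].quantile(upper)
    if lower is not None:
        mask &= df[column] > df[column].quantile(lower)
    return df[mask]
# Usage, e.g.: data_2 = trim_quantile(data_1, 'Mileage', upper=0.99)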
# Reset the index and drop the column containing the old index
data_cleaned = data_4.reset_index(drop=True)
# Let's see what's left
data_cleaned.describe(include='all')
# We could simply use plt.scatter() for each of Year, EngineV, and Mileage
# But since Price is on the 'y' axis of all three plots, it makes sense to plot them side by side (so we can compare them)
f, (ax1, ax2, ax3) = plt.subplots(1, 3, sharey=True, figsize =(15,3)) #sharey -> share 'Price' as y
ax1.scatter(data_cleaned['Year'],data_cleaned['Price'])
ax1.set_title('Price and Year')
ax2.scatter(data_cleaned['EngineV'],data_cleaned['Price'])
ax2.set_title('Price and EngineV')
ax3.scatter(data_cleaned['Mileage'],data_cleaned['Price'])
ax3.set_title('Price and Mileage')
plt.show()
sns.distplot(data_cleaned['Price'])
# Transform 'Price' with a log transformation
log_price = np.log(data_cleaned['Price'])
# Then add it to our data frame
data_cleaned['log_price'] = log_price
data_cleaned
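To quantify what the transformation buys us, we can compare the skewness of the raw and log prices (a symmetric distribution has skewness near 0); a quick sketch:
# Skewness before and after the log transformation
print('Price skew:', round(data_cleaned['Price'].skew(), 2))
print('log_price skew:', round(data_cleaned['log_price'].skew(), 2))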
# Let's check the three scatters once again
f, (ax1, ax2, ax3) = plt.subplots(1, 3, sharey=True, figsize =(15,3))
ax1.scatter(data_cleaned['Year'],data_cleaned['log_price'])
ax1.set_title('Log Price and Year')
ax2.scatter(data_cleaned['EngineV'],data_cleaned['log_price'])
ax2.set_title('Log Price and EngineV')
ax3.scatter(data_cleaned['Mileage'],data_cleaned['log_price'])
ax3.set_title('Log Price and Mileage')
plt.show()
# Since we will be using the log price variable, we can drop the old 'Price' one
data_cleaned = data_cleaned.drop(['Price'],axis=1)
Now we will deal with the rest of the regression assumptions, starting with multicollinearity:
# Let's quickly see the columns of our data frame
data_cleaned.columns.values
from statsmodels.stats.outliers_influence import variance_inflation_factor
# To make this as easy as possible, we declare a variable containing all the features
# we want to check for multicollinearity
# Since our categorical data is not yet preprocessed, we will only take the numerical variables
variables = data_cleaned[['Mileage','Year','EngineV']]
# we create a new data frame which will include all the VIFs
# note that each variable has its own variance inflation factor as this measure is variable specific (not model specific)
vif = pd.DataFrame()
# here we make use of the variance_inflation_factor, which will basically output the respective VIFs
vif["VIF"] = [variance_inflation_factor(variables.values, i) for i in range(variables.shape[1])]
# Finally, I like to include names so it is easier to explore the result
vif["Features"] = variables.columns
# Let's explore the result
vif
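One aside (ours, not part of the original walkthrough): variance_inflation_factor uses the design matrix exactly as given, so without a constant column the auxiliary regressions run without an intercept and the VIFs tend to come out inflated. A sketch of the variant with an intercept:
# Add a constant column before computing the VIFs (the 'const' row itself can be ignored)
variables_const = sm.add_constant(variables)
vif_const = pd.DataFrame()
vif_const['VIF'] = [variance_inflation_factor(variables_const.values, i)
                    for i in range(variables_const.shape[1])]
vif_const['Features'] = variables_const.columns
vif_const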
# 'Year' has the highest VIF here, so we drop it to ease the multicollinearity problem
data_no_multicollinearity = data_cleaned.drop(['Year'],axis=1)
# Create dummies for all categorical variables; drop_first avoids the dummy variable trap
data_with_dummies = pd.get_dummies(data_no_multicollinearity, drop_first=True)
# Check the result
data_with_dummies.head()
# Display all column names, so we can conveniently reorder them below
data_with_dummies.columns.values
cols = ['log_price', 'Mileage', 'EngineV', 'Brand_BMW',
'Brand_Mercedes-Benz', 'Brand_Mitsubishi', 'Brand_Renault',
'Brand_Toyota', 'Brand_Volkswagen', 'Body_hatch', 'Body_other',
'Body_sedan', 'Body_vagon', 'Body_van', 'Engine Type_Gas',
'Engine Type_Other', 'Engine Type_Petrol', 'Registration_yes', 'Model_100', 'Model_11', 'Model_116', 'Model_118', 'Model_120',
'Model_19', 'Model_190', 'Model_200', 'Model_210', 'Model_220',
'Model_230', 'Model_25', 'Model_250', 'Model_300', 'Model_316',
'Model_318', 'Model_320', 'Model_323', 'Model_325', 'Model_328',
'Model_330', 'Model_335', 'Model_4 Series Gran Coupe', 'Model_428',
'Model_4Runner', 'Model_5 Series', 'Model_5 Series GT',
'Model_520', 'Model_523', 'Model_524', 'Model_525', 'Model_528',
'Model_530', 'Model_535', 'Model_540', 'Model_545', 'Model_550',
'Model_6 Series Gran Coupe', 'Model_630', 'Model_640', 'Model_645',
'Model_650', 'Model_730', 'Model_735', 'Model_740', 'Model_745',
'Model_750', 'Model_760', 'Model_80', 'Model_9', 'Model_90',
'Model_A 140', 'Model_A 150', 'Model_A 170', 'Model_A 180',
'Model_A1', 'Model_A3', 'Model_A4', 'Model_A4 Allroad', 'Model_A5',
'Model_A6', 'Model_A6 Allroad', 'Model_A7', 'Model_A8',
'Model_ASX', 'Model_Amarok', 'Model_Auris', 'Model_Avalon',
'Model_Avensis', 'Model_Aygo', 'Model_B 170', 'Model_B 180',
'Model_B 200', 'Model_Beetle', 'Model_Bora', 'Model_C-Class',
'Model_CL 180', 'Model_CL 500', 'Model_CL 55 AMG', 'Model_CL 550',
'Model_CL 63 AMG', 'Model_CLA 200', 'Model_CLA 220',
'Model_CLA-Class', 'Model_CLC 180', 'Model_CLC 200',
'Model_CLK 200', 'Model_CLK 220', 'Model_CLK 230', 'Model_CLK 240',
'Model_CLK 280', 'Model_CLK 320', 'Model_CLK 430', 'Model_CLS 350',
'Model_CLS 400', 'Model_CLS 500', 'Model_CLS 63 AMG',
'Model_Caddy', 'Model_Camry', 'Model_Captur', 'Model_Caravelle',
'Model_Carina', 'Model_Carisma', 'Model_Celica', 'Model_Clio',
'Model_Colt', 'Model_Corolla', 'Model_Corolla Verso',
'Model_Cross Touran', 'Model_Dokker', 'Model_Duster',
'Model_E-Class', 'Model_Eclipse', 'Model_Eos', 'Model_Espace',
'Model_FJ Cruiser', 'Model_Fluence', 'Model_Fortuner',
'Model_G 320', 'Model_G 500', 'Model_G 55 AMG', 'Model_G 63 AMG',
'Model_GL 320', 'Model_GL 350', 'Model_GL 420', 'Model_GL 450',
'Model_GL 500', 'Model_GL 550', 'Model_GLC-Class',
'Model_GLE-Class', 'Model_GLK 220', 'Model_GLK 300',
'Model_GLS 350', 'Model_Galant', 'Model_Golf GTI', 'Model_Golf II',
'Model_Golf III', 'Model_Golf IV', 'Model_Golf Plus',
'Model_Golf V', 'Model_Golf VI', 'Model_Golf VII',
'Model_Golf Variant', 'Model_Grand Scenic', 'Model_Grandis',
'Model_Hiace', 'Model_Highlander', 'Model_Hilux', 'Model_I3',
'Model_IQ', 'Model_Jetta', 'Model_Kangoo', 'Model_Koleos',
'Model_L 200', 'Model_LT', 'Model_Laguna', 'Model_Lancer',
'Model_Lancer Evolution', 'Model_Lancer X',
'Model_Lancer X Sportback', 'Model_Land Cruiser 100',
'Model_Land Cruiser 105', 'Model_Land Cruiser 200',
'Model_Land Cruiser 76', 'Model_Land Cruiser 80',
'Model_Land Cruiser Prado', 'Model_Latitude', 'Model_Logan',
'Model_Lupo', 'Model_M5', 'Model_M6', 'Model_MB', 'Model_ML 250',
'Model_ML 270', 'Model_ML 280', 'Model_ML 320', 'Model_ML 350',
'Model_ML 400', 'Model_ML 430', 'Model_ML 500', 'Model_ML 550',
'Model_ML 63 AMG', 'Model_Master', 'Model_Matrix', 'Model_Megane',
'Model_Modus', 'Model_Multivan', 'Model_New Beetle',
'Model_Outlander', 'Model_Outlander XL', 'Model_Pajero',
'Model_Pajero Pinin', 'Model_Pajero Sport', 'Model_Pajero Wagon',
'Model_Passat B3', 'Model_Passat B4', 'Model_Passat B5',
'Model_Passat B6', 'Model_Passat B7', 'Model_Passat B8',
'Model_Passat CC', 'Model_Phaeton', 'Model_Pointer', 'Model_Polo',
'Model_Previa', 'Model_Prius', 'Model_Q3', 'Model_Q5', 'Model_Q7',
'Model_R 320', 'Model_R8', 'Model_Rav 4', 'Model_S 140',
'Model_S 250', 'Model_S 300', 'Model_S 320', 'Model_S 350',
'Model_S 400', 'Model_S 430', 'Model_S 500', 'Model_S 550',
'Model_S 600', 'Model_S 63 AMG', 'Model_S 65 AMG', 'Model_S4',
'Model_S5', 'Model_S8', 'Model_SL 500 (550)', 'Model_SL 55 AMG',
'Model_SLK 200', 'Model_SLK 350', 'Model_Sandero',
'Model_Sandero StepWay', 'Model_Scenic', 'Model_Scion',
'Model_Scirocco', 'Model_Sequoia', 'Model_Sharan', 'Model_Sienna',
'Model_Smart', 'Model_Space Star', 'Model_Space Wagon',
'Model_Sprinter 208', 'Model_Sprinter 210', 'Model_Sprinter 211',
'Model_Sprinter 212', 'Model_Sprinter 213', 'Model_Sprinter 311',
'Model_Sprinter 312', 'Model_Sprinter 313', 'Model_Sprinter 315',
'Model_Sprinter 316', 'Model_Sprinter 318', 'Model_Sprinter 319',
'Model_Symbol', 'Model_Syncro', 'Model_T3 (Transporter)',
'Model_T4 (Transporter)', 'Model_T4 (Transporter) ',
'Model_T5 (Transporter)', 'Model_T5 (Transporter) ',
'Model_T6 (Transporter)', 'Model_T6 (Transporter) ', 'Model_TT',
'Model_Tacoma', 'Model_Tiguan', 'Model_Touareg', 'Model_Touran',
'Model_Trafic', 'Model_Tundra', 'Model_Up', 'Model_V 250',
'Model_Vaneo', 'Model_Vento', 'Model_Venza', 'Model_Viano',
'Model_Virage', 'Model_Vista', 'Model_Vito', 'Model_X1',
'Model_X3', 'Model_X5', 'Model_X5 M', 'Model_X6', 'Model_X6 M',
'Model_Yaris', 'Model_Z3', 'Model_Z4']
# To implement the reordering, we will create a new df, which is equal to the old one but with the new order of features
data_preprocessed = data_with_dummies[cols]
data_preprocessed.head()
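Hard-coding 200+ column names is brittle; a programmatic alternative (a sketch, assuming get_dummies kept the numerical columns first and appended the dummies in order) would be:
# Put 'log_price' first and keep the remaining columns in their existing order
cols_auto = ['log_price'] + [c for c in data_with_dummies.columns if c != 'log_price']
# data_preprocessed = data_with_dummies[cols_auto]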
# The target (dependent variable) is 'log_price'
targets = data_preprocessed['log_price']
# The inputs are everything BUT the dependent variable, so we can simply drop it
inputs = data_preprocessed.drop(['log_price'],axis=1)
# Import the scaling module
from sklearn.preprocessing import StandardScaler
# Create a scaler object
scaler = StandardScaler()
# Fit the inputs (calculate the mean and standard deviation feature-wise)
scaler.fit(inputs)
# Scale the features and store them in a new variable (the actual scaling procedure)
inputs_scaled = scaler.transform(inputs)
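As a side note (ours): the fit and transform steps can be collapsed into one call, and standardizing the dummies is a debatable choice; it does not change the predictions of an unregularized linear regression, but the dummies lose their clean 0/1 interpretation:
# Equivalent one-liner for the two steps above
# inputs_scaled = scaler.fit_transform(inputs)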
# Import the module for the split
from sklearn.model_selection import train_test_split
# Split the variables with an 80-20 split and some random state
# To have the same split each time, use random_state = 365
x_train, x_test, y_train, y_test = train_test_split(inputs_scaled, targets, test_size=0.2, random_state=365)
# Create a linear regression object
reg = LinearRegression()
# Fit the regression with the scaled TRAIN inputs and targets
reg.fit(x_train,y_train)
# Let's check the outputs of the regression
# I'll store them in y_hat as this is the 'theoretical' name of the predictions
y_hat = reg.predict(x_train)
# The simplest way to compare the targets (y_train) and the predictions (y_hat) is to plot them on a scatter plot
# The closer the points to the 45-degree line, the better the prediction
plt.scatter(y_train, y_hat)
# Let's also name the axes
plt.xlabel('Targets (y_train)',size=18)
plt.ylabel('Predictions (y_hat)',size=18)
# We need to make sure the scales of the x-axis and the y-axis are the same
# Otherwise we wouldn't be able to interpret the '45-degree line'
plt.xlim(6,13)
plt.ylim(6,13)
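# (Sketch, our addition) The 45-degree reference line itself could be drawn with:
# plt.plot([6, 13], [6, 13], c='red', lw=1)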
plt.show()
# Another useful check of our model is a residual plot
# Residuals = targets - predictions
# We can plot the PDF of the residuals and check for anomalies
sns.distplot(y_train - y_hat)
# Include a title
plt.title("Residuals PDF", size=18)
# Find the R-squared of the model
r2 = reg.score(x_train,y_train)
r2
# Number of observations is the shape along axis 0
n = x_train.shape[0]
# Number of features (predictors, p) is the shape along axis 1
p = x_train.shape[1]
# We find the Adjusted R-squared using the formula
adjusted_r2 = 1-(1-r2)*(n-1)/(n-p-1)
adjusted_r2
from sklearn.feature_selection import f_regression
f_regression(x_train, y_train)
p_values = f_regression(x_train, y_train)[1].round(3)
p_values
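Keep in mind that f_regression fits a separate simple regression per feature, so these p-values are univariate and do not come from our multivariate model. For model-based p-values, one option (a sketch using the already-imported statsmodels) is:
# Fit the same regression with statsmodels to get per-coefficient p-values
# ols_results = sm.OLS(y_train, sm.add_constant(x_train)).fit()
# ols_results.summary()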
# Obtain the bias (intercept) of the regression
reg.intercept_
# Obtain the weights (coefficients) of the regression
reg.coef_
# Create a regression summary where we can compare them with one-another
reg_summary = pd.DataFrame(inputs.columns.values, columns=['Features'])
reg_summary['Weights'] = reg.coef_
reg_summary['p-values'] = p_values
reg_summary
# Check the different categories in the 'Brand' variable
data_cleaned['Brand'].unique()
# Check the different categories in the 'Body' variable
data_cleaned['Body'].unique()
# Check the different categories in the 'Engine Type' variable
data_cleaned['Engine Type'].unique()
# Find the predicted values using inputs from the test data
y_hat_test = reg.predict(x_test)
# Create a scatter plot with the test targets and the test predictions
# Including the argument 'alpha' introduces proportional opacity to the graph
# The more saturated the color, the higher the concentration of points (like a heat map)
plt.scatter(y_test, y_hat_test, alpha=0.2)
plt.xlabel('Targets (y_test)',size=18)
plt.ylabel('Predictions (y_hat_test)',size=18)
plt.xlim(6,13)
plt.ylim(6,13)
plt.show()
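To quantify the out-of-sample performance beyond the eyeball test, we can also compute the test R-squared; a one-line sketch:
# R-squared on the test set (compare with the train R-squared from above)
reg.score(x_test, y_test)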
# Finally, let's manually check these predictions
# To obtain the actual prices, we take the exponential of the log_price
df_pf = pd.DataFrame(np.exp(y_hat_test), columns=['Prediction'])
df_pf.head()
# We can also include the test targets in that data frame (so we can manually compare them)
df_pf['Target'] = np.exp(y_test)
df_pf
# Note that 'Target' is now mostly NaNs: after the split, y_test kept its original
# (shuffled) index, while df_pf has a fresh 0-based index, so the assignment misaligned the rows
y_test
y_test = y_test.reset_index(drop=True)
# Check the result
y_test.head()
# Let's overwrite the 'Target' column with the appropriate values
# Again, we need the exponential of the test log price
df_pf['Target'] = np.exp(y_test)
df_pf
df_pf['Residual'] = df_pf['Target'] - df_pf['Prediction']
# Finally, it makes sense to see how far off we are from the result percentage-wise
# Here, we take the absolute difference in %, so we can easily order the data frame
df_pf['Difference%'] = np.absolute(df_pf['Residual']/df_pf['Target']*100)
df_pf
# Exploring the descriptives here gives us additional insights
df_pf.describe()
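If a single headline number is preferred, standard error metrics can be computed on the same frame; a sketch using sklearn.metrics:
from sklearn.metrics import mean_absolute_error
# Mean absolute error of the predictions, in the original price units
mean_absolute_error(df_pf['Target'], df_pf['Prediction'])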
# Sometimes it is useful to check these outputs manually
# To see all rows, we use the relevant pandas syntax
pd.options.display.max_rows = 999
# Moreover, to make the output easier to read, we can display the result with only 2 digits after the decimal point
pd.set_option('display.float_format', lambda x: '%.2f' % x)
# Finally, we sort by difference in % and manually check the model
df_pf.sort_values(by=['Difference%'])
How to potentially improve our model further: