import pandas as pd
import numpy as np
from datetime import datetime
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
import joblib
import seaborn as sns
import statsmodels.api as sm
import pickle 
# Read the raw CSV into a DataFrame and preview its first rows.
DATA_PATH = "/home/gsaksham/Downloads/data.csv"
data = pd.read_csv(DATA_PATH)
data.head()
appointmentId inspectionStartTime year month engineTransmission_battery_value engineTransmission_battery_cc_value_0 engineTransmission_battery_cc_value_1 engineTransmission_battery_cc_value_2 engineTransmission_battery_cc_value_3 engineTransmission_battery_cc_value_4 ... engineTransmission_gearShifting_cc_value_1 engineTransmission_gearShifting_cc_value_2 engineTransmission_comments_value_0 engineTransmission_comments_value_1 engineTransmission_comments_value_2 engineTransmission_comments_value_3 engineTransmission_comments_value_4 fuel_type odometer_reading rating_engineTransmission
0 aj_01 2/3/2019 15:43 2008 8 No Weak NaN NaN NaN NaN ... NaN NaN Pickup Low Starter Motor / Solenoid malfunctioning NaN NaN NaN Petrol + CNG 124600 3.0
1 aj_02 1/16/2019 13:02 2007 5 Yes NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN Petrol + CNG 78108 4.0
2 aj_03 2/9/2019 13:31 2012 5 Yes NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN Diesel 93348 3.5
3 aj_04 1/18/2019 11:02 2013 1 Yes NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN Petrol + CNG 48233 3.5
4 aj_05 1/27/2019 12:12 2011 7 Yes NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN Petrol + CNG 74368 4.0

5 rows × 32 columns

# Number of rows in the loaded frame.
len(data)
26307
# Summary statistics (count, mean, std, quartiles, min/max) of the numeric columns.
data.describe()
year month odometer_reading rating_engineTransmission
count 26307.000000 26307.000000 26307.000000 26307.000000
mean 2010.856578 5.462006 76460.143764 3.624663
std 3.766234 3.583866 46762.524489 0.847645
min 1989.000000 1.000000 1.000000 0.500000
25% 2008.000000 2.000000 46396.000000 3.500000
50% 2011.000000 5.000000 72013.000000 4.000000
75% 2014.000000 9.000000 98289.500000 4.000000
max 2019.000000 12.000000 999999.000000 5.000000
# Keep a handle on the column index and display it.
data_cols=data.columns
data_cols
Index(['appointmentId', 'inspectionStartTime', 'year', 'month',
       'engineTransmission_battery_value',
       'engineTransmission_battery_cc_value_0',
       'engineTransmission_battery_cc_value_1',
       'engineTransmission_battery_cc_value_2',
       'engineTransmission_battery_cc_value_3',
       'engineTransmission_battery_cc_value_4',
       'engineTransmission_engineoilLevelDipstick_value',
       'engineTransmission_engineOilLevelDipstick_cc_value_0',
       'engineTransmission_engineOil',
       'engineTransmission_engineOil_cc_value_0',
       'engineTransmission_engineOil_cc_value_1',
       'engineTransmission_engineOil_cc_value_2',
       'engineTransmission_engineOil_cc_value_3',
       'engineTransmission_engineOil_cc_value_4',
       'engineTransmission_engineOil_cc_value_5',
       'engineTransmission_engineOil_cc_value_6',
       'engineTransmission_gearShifting_value',
       'engineTransmission_gearShifting_cc_value_0',
       'engineTransmission_gearShifting_cc_value_1',
       'engineTransmission_gearShifting_cc_value_2',
       'engineTransmission_comments_value_0',
       'engineTransmission_comments_value_1',
       'engineTransmission_comments_value_2',
       'engineTransmission_comments_value_3',
       'engineTransmission_comments_value_4', 'fuel_type', 'odometer_reading',
       'rating_engineTransmission'],
      dtype='object')
# Share of missing values per column, expressed as a percentage of all rows.
missing_counts = data.isna().sum()
percentage_data_missing = (missing_counts / len(data)) * 100
percentage_data_missing
appointmentId                                            0.000000
inspectionStartTime                                      0.000000
year                                                     0.000000
month                                                    0.000000
engineTransmission_battery_value                         0.000000
engineTransmission_battery_cc_value_0                   86.931235
engineTransmission_battery_cc_value_1                   98.365454
engineTransmission_battery_cc_value_2                   99.726309
engineTransmission_battery_cc_value_3                   99.939180
engineTransmission_battery_cc_value_4                   99.984795
engineTransmission_engineoilLevelDipstick_value          0.000000
engineTransmission_engineOilLevelDipstick_cc_value_0    98.437678
engineTransmission_engineOil                             0.000000
engineTransmission_engineOil_cc_value_0                 29.459840
engineTransmission_engineOil_cc_value_1                 58.170829
engineTransmission_engineOil_cc_value_2                 74.938229
engineTransmission_engineOil_cc_value_3                 85.775649
engineTransmission_engineOil_cc_value_4                 93.264150
engineTransmission_engineOil_cc_value_5                 97.685027
engineTransmission_engineOil_cc_value_6                 99.540046
engineTransmission_gearShifting_value                    0.000000
engineTransmission_gearShifting_cc_value_0              86.832402
engineTransmission_gearShifting_cc_value_1              97.069221
engineTransmission_gearShifting_cc_value_2              99.749116
engineTransmission_comments_value_0                     84.544038
engineTransmission_comments_value_1                     95.027939
engineTransmission_comments_value_2                     98.517505
engineTransmission_comments_value_3                     99.775725
engineTransmission_comments_value_4                     99.946782
fuel_type                                                0.000000
odometer_reading                                         0.000000
rating_engineTransmission                                0.000000
dtype: float64
# Treat zeros, single-space strings and the literal text 'NaN' as missing.
data = data.replace([0, ' ', 'NaN'], np.nan)

# Keep only the columns that have non-missing values in at least 80% of rows.
# `thresh` is passed on its own: pandas documents that it cannot be combined
# with `how` (recent versions raise, older ones ignored `how`), and it is
# rounded up to an integer so the same columns survive as with the float.
min_non_null = int(np.ceil(data.shape[0] * 0.8))
data = data.dropna(thresh=min_non_null, axis=1)

# Row count is unchanged because only columns were dropped.
len(data)
26307
# Per-column count of missing values in the current frame.
data.isnull().sum()
appointmentId                                      0
inspectionStartTime                                0
year                                               0
month                                              0
engineTransmission_battery_value                   0
engineTransmission_engineoilLevelDipstick_value    0
engineTransmission_engineOil                       0
engineTransmission_gearShifting_value              0
fuel_type                                          0
odometer_reading                                   0
rating_engineTransmission                          0
dtype: int64
# `df` is another name for the same object as `data` (no copy is made), so
# the casts below are visible through both names.
df = data

# Promote every int64 column to float, echoing the resulting dtype each time.
int_columns = [col for col in df.columns if df[col].dtypes == "int64"]
for col in int_columns:
    df[col] = df[col].astype(float)
    print(df[col].dtypes)
float64
float64
float64
# Print column names, non-null counts and dtypes.
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26307 entries, 0 to 26306
Data columns (total 11 columns):
 #   Column                                           Non-Null Count  Dtype  
---  ------                                           --------------  -----  
 0   appointmentId                                    26307 non-null  object 
 1   inspectionStartTime                              26307 non-null  object 
 2   year                                             26307 non-null  float64
 3   month                                            26307 non-null  float64
 4   engineTransmission_battery_value                 26307 non-null  object 
 5   engineTransmission_engineoilLevelDipstick_value  26307 non-null  object 
 6   engineTransmission_engineOil                     26307 non-null  object 
 7   engineTransmission_gearShifting_value            26307 non-null  object 
 8   fuel_type                                        26307 non-null  object 
 9   odometer_reading                                 26307 non-null  float64
 10  rating_engineTransmission                        26307 non-null  float64
dtypes: float64(4), object(7)
memory usage: 2.2+ MB
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

# Modelling frame: non-object columns only, gaps filled with the column mean.
df = df.select_dtypes(exclude=['object'])
df = df.fillna(df.mean())

# Separate the target from the remaining feature columns.
target_col = 'rating_engineTransmission'
y = df[target_col]
X = df.drop(columns=[target_col])

# 70/30 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0
)

# Random forest with 1000 trees and a fixed seed.
regressor = RandomForestRegressor(n_estimators=1000, random_state=42)
regressor.fit(X_train, y_train)
RandomForestRegressor(n_estimators=1000, random_state=42)
# Predict on the held-out rows and line the predictions up against the actual
# ratings. Note that this rebinds `df` to the comparison table.
y_pred = regressor.predict(X_test)
df = pd.DataFrame({'Actual': y_test}).assign(Predicted=y_pred)
df
Actual Predicted
2930 4.0 3.0440
13347 3.5 3.6040
5211 1.0 2.8520
2826 3.5 3.9860
19333 3.5 3.3250
... ... ...
13809 4.5 4.2505
7944 3.5 3.8705
570 4.0 3.5235
16688 4.5 3.8845
3494 4.5 4.5770

7893 rows × 2 columns

from sklearn import metrics

# Error metrics on the test split; the MSE is computed once and reused for the RMSE.
mae = metrics.mean_absolute_error(y_test, y_pred)
mse = metrics.mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print('Mean Absolute Error:', mae)
print('Mean Squared Error:', mse)
print('Root Mean Squared Error:', rmse)
Mean Absolute Error: 0.47982060151249145
Mean Squared Error: 0.423646705015395
Root Mean Squared Error: 0.6508814830792123
# "Accuracy" here is 100 minus the mean absolute percentage error.
# The division by y_test means a zero rating in the test split would yield
# an infinite percentage error.
errors = np.abs(y_pred - y_test)
map_ = (errors / y_test) * 100
accuracy = 100 - map_.mean()
print('Accuracy:', round(accuracy, 2), '%.')
Accuracy: 82.95 %.
# Correlation heatmap over the numeric columns only. The object-typed columns
# are excluded explicitly: older pandas dropped them silently inside corr(),
# whereas pandas >= 2.0 raises when corr() meets non-numeric data.
plt.figure(figsize=(9, 6))
numeric_corr = data.select_dtypes(include='number').corr()
sns.heatmap(numeric_corr, annot=True, cmap='cubehelix_r')
plt.show()

# Bar chart of how many rows carry each rating value.
data['rating_engineTransmission'].value_counts().plot.bar(figsize=(8,6), title='Rating_engine')
<AxesSubplot:title={'center':'Rating_engine'}>
# Bucket ratings into four labelled bands. The top edge is 5 so that the
# highest ratings fall inside 'Very high'; with an upper edge of 4.5 any
# rating above 4.5 was binned as NaN and silently left out of the crosstab.
bins = [0, 2, 3, 4, 5]
group = ['Low', 'Average', 'High', 'Very high']
data['Rating_bin'] = pd.cut(data['rating_engineTransmission'], bins, labels=group)

# Fuel-type mix within each rating band, normalised per band (rows sum to 1),
# drawn as a stacked bar chart.
Rating_bin = pd.crosstab(data['Rating_bin'], data['fuel_type'])
Rating_bin.div(Rating_bin.sum(axis=1).astype(float), axis=0).plot(figsize=(20,10),kind="bar", stacked=True)
plt.xlabel('Rating')
plt.ylabel('Percentage')
Text(0, 0.5, 'Percentage')
# Ordinary least squares of the rating on the feature columns.
# NOTE(review): X is passed as-is, so no intercept term is fitted and the
# reported R-squared is the uncentered one; confirm this is intended, or
# wrap X with sm.add_constant.
model = sm.OLS(endog=y, exog=X)
results = model.fit()
results.summary()
OLS Regression Results
Dep. Variable: rating_engineTransmission R-squared (uncentered): 0.956
Model: OLS Adj. R-squared (uncentered): 0.956
Method: Least Squares F-statistic: 1.906e+05
Date: Wed, 04 Nov 2020 Prob (F-statistic): 0.00
Time: 16:25:14 Log-Likelihood: -30815.
No. Observations: 26307 AIC: 6.164e+04
Df Residuals: 26304 BIC: 6.166e+04
Df Model: 3
Covariance Type: nonrobust
coef std err t P>|t| [0.025 0.975]
year 0.0020 5.88e-06 336.860 0.000 0.002 0.002
month 0.0257 0.001 19.114 0.000 0.023 0.028
odometer_reading -6.523e-06 1.03e-07 -63.443 0.000 -6.72e-06 -6.32e-06
Omnibus: 6769.641 Durbin-Watson: 1.997
Prob(Omnibus): 0.000 Jarque-Bera (JB): 18964.773
Skew: -1.362 Prob(JB): 0.00
Kurtosis: 6.143 Cond. No. 2.50e+04


Notes:
[1] R² is computed without centering (uncentered) since the model does not contain a constant.
[2] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[3] The condition number is large, 2.5e+04. This might indicate that there are
strong multicollinearity or other numerical problems.
!pip freeze > requirements
filename = 'celebaltech.sav'
pickle.dump(regressor, open(filename, 'wb'))