Electric Automobile Data Analysis
Here we walk through some important data analysis techniques that come up while working with the Electric Automobile dataset.
import pandas as pd
import numpy as np
from datetime import datetime
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
import joblib
import seaborn as sns
import statsmodels.api as sm
import pickle
# Load the raw dataset and take a first look at its size, summary statistics,
# column names and how much of each column is missing.
data = pd.read_csv("/home/gsaksham/Downloads/data.csv")
data.head()
data.shape[0]
data.describe()

data_cols = data.columns
data_cols

# Missing values per column: absolute counts first, then as a percentage of
# the total number of rows.
data.isna().sum()
percentage_data_missing = data.isna().sum().div(data.shape[0]).mul(100)
percentage_data_missing
# Normalise the placeholder values 0, ' ' and the literal text 'NaN' to real
# missing values so the null counts below include them.
# NOTE(review): this also turns genuine zeros into missing values - confirm
# that 0 never carries meaning in this dataset.
data = data.replace([0, ' ', 'NaN'], np.nan)

# Keep only the columns that have non-missing values in at least 80% of rows.
# `how` and `thresh` are mutually exclusive in DataFrame.dropna (pandas 2.x
# raises TypeError when both are passed; older versions ignored `how` whenever
# `thresh` was given), so only `thresh` is supplied. It is rounded up to an
# int because a column's non-null count is always a whole number, which keeps
# the selected columns identical to the original float threshold.
min_non_null = int(np.ceil(data.shape[0] * 0.8))
data = data.dropna(thresh=min_non_null, axis=1)
len(data)
data.isnull().sum()
# `df` is a second name for the same DataFrame object as `data` (no copy is
# made), so the dtype conversion below is visible through both names until
# `df` is rebound further down.
df = data

# Convert every int64 column to float and print the resulting dtype.
# NOTE(review): the original indentation of this loop was lost; the print is
# assumed to belong inside the `if` (one line per converted column) - confirm.
for column_name in df:
    if df[column_name].dtypes == "int64":
        df[column_name] = df[column_name].astype(float)
        print(df[column_name].dtypes)
df.info()

# Keep only the non-object columns, then fill the remaining gaps in each
# column with that column's mean. From here on `df` is a new object and no
# longer shares data with `data`.
df = df.select_dtypes(exclude=['object'])
df = df.fillna(df.mean())
# Split the cleaned frame into the target (engine/transmission rating) and
# the remaining columns used as predictors.
y = df['rating_engineTransmission']
X = df.drop(columns=['rating_engineTransmission'])

# Hold out 30% of the rows for evaluation; the seed makes the split repeatable.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
)

# Fit a 1000-tree random forest regressor and predict the held-out ratings.
from sklearn.ensemble import RandomForestRegressor
regressor = RandomForestRegressor(n_estimators=1000, random_state=42)
regressor.fit(X_train, y_train)
y_pred = regressor.predict(X_test)

# Side-by-side view of true and predicted ratings.
# Note: this rebinds `df`, replacing the cleaned feature frame built earlier.
df = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
df
from sklearn import metrics

# Standard regression error metrics on the held-out set. The mean squared
# error is computed once and reused for its square root.
mae = metrics.mean_absolute_error(y_test, y_pred)
mse = metrics.mean_squared_error(y_test, y_pred)
print('Mean Absolute Error:', mae)
print('Mean Squared Error:', mse)
print('Root Mean Squared Error:', np.sqrt(mse))

# "Accuracy" here is 100 minus the mean absolute percentage error.
# NOTE(review): a true rating of exactly 0 would make the division below
# blow up - confirm the target never contains zeros at this point.
errors = np.abs(y_pred - y_test)
map_ = (errors / y_test) * 100
accuracy = 100 - map_.mean()
print('Accuracy:', round(accuracy, 2), '%.')
# Annotated heatmap of the pairwise correlations between columns of `data`.
plt.figure(figsize=(9, 6))
# `data` still contains object (string) columns such as `fuel_type`. Since
# pandas 2.0, DataFrame.corr() no longer drops those silently and raises
# instead, so restrict the frame to numeric and boolean columns first. This
# matches what older pandas versions did implicitly.
numeric_data = data.select_dtypes(include=['number', 'bool'])
sns.heatmap(numeric_data.corr(), annot=True, cmap='cubehelix_r')
plt.show()
# Bar chart of how often each engine/transmission rating value occurs.
rating_counts = data['rating_engineTransmission'].value_counts()
rating_counts.plot.bar(figsize=(8, 6), title='Rating_engine')

# Bucket the rating into four labelled bands. pd.cut uses right-closed
# intervals by default, so the bands are (0, 2], (2, 3], (3, 4], (4, 4.5];
# any rating outside that range gets no band (NaN).
bins = [0, 2, 3, 4, 4.5]
group = ['Low', 'Average', 'High', 'Very high']
data['Rating_bin'] = pd.cut(
    data['rating_engineTransmission'],
    bins=bins,
    labels=group,
)

# Cross-tabulate rating band against fuel type, normalise each row by its
# total, and draw the result as a stacked bar chart.
# Note: the plotted values are row fractions in [0, 1], although the axis
# label below reads 'Percentage'.
Rating_bin = pd.crosstab(data['Rating_bin'], data['fuel_type'])
row_totals = Rating_bin.sum(axis=1).astype(float)
Rating_bin.div(row_totals, axis=0).plot(kind="bar", stacked=True, figsize=(20, 10))
plt.xlabel('Rating')
plt.ylabel('Percentage')
# Ordinary least squares fit of the rating on the same predictors, followed by
# the statsmodels summary table.
# NOTE(review): X is passed as-is, and sm.OLS does not add an intercept column
# on its own - confirm that a model without a constant term is intended.
model = sm.OLS(endog=y, exog=X)
results = model.fit()
results.summary()
# IPython/Jupyter shell escape (not plain Python syntax): redirects the output
# of `pip freeze` into a file named `requirements` in the current directory.
!pip freeze > requirements
# Persist the fitted random forest to disk with pickle.
filename = 'celebaltech.sav'
# Use a context manager so the file handle is flushed and closed even if
# pickling fails; the original left the handle from open() unclosed.
with open(filename, 'wb') as model_file:
    pickle.dump(regressor, model_file)