Experiment 1
Exp 1: Write a program to predict the price of an Uber ride from a given pickup point to the agreed drop-off location.
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
train_data = pd.read_csv('/content/drive/MyDrive/ML Lab Sem 7/uber.csv')
train_data.head(2)
train_data.drop(labels='Unnamed: 0',axis=1,inplace=True)
train_data.drop(labels='key',axis=1,inplace=True)
train_data.info()
train_data['pickup_datetime'] = pd.to_datetime(train_data['pickup_datetime'],errors='coerce')
train_data.info()
train_data.isnull().sum()  # check the number of null values per column
train_data.dropna(axis = 0, inplace= True)
train_data.isnull().sum()
train_data["pickup_latitude"].max()
min_longitude = -180
max_longitude = 180
min_latitude = -90
max_latitude = 90
invalid_rows = train_data[
(train_data["dropoff_latitude"] < min_latitude) |
(train_data["pickup_latitude"] < min_latitude) |
(train_data["dropoff_longitude"] < min_longitude) |
(train_data["pickup_longitude"] < min_longitude) |
(train_data["dropoff_latitude"] > max_latitude) |
(train_data["pickup_latitude"] > max_latitude) |
(train_data["dropoff_longitude"] > max_longitude) |
(train_data["pickup_longitude"] > max_longitude)
]
print("Before dropping:", train_data.shape)
train_data = train_data.drop(invalid_rows.index)
print("After dropping:", train_data.shape)
import calendar
train_data['day']=train_data['pickup_datetime'].apply(lambda x:x.day)
train_data['hour']=train_data['pickup_datetime'].apply(lambda x:x.hour)
train_data['month']=train_data['pickup_datetime'].apply(lambda x:x.month)
train_data['year']=train_data['pickup_datetime'].apply(lambda x:x.year)
train_data['weekday']=train_data['pickup_datetime'].apply(lambda x: calendar.day_name[x.weekday()])
train_data.weekday = train_data.weekday.map({'Sunday':0,'Monday':1,'Tuesday':2,'Wednesday':3,'Thursday':4,'Friday':5,'Saturday':6})
train_data.drop(labels = 'pickup_datetime',axis=1,inplace=True)
train_data.info()
from sklearn.model_selection import train_test_split
x=train_data.drop("fare_amount", axis=1)
y=train_data["fare_amount"]
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=101)
# Linear regression training
from sklearn.linear_model import LinearRegression
linear_model=LinearRegression()
linear_model.fit(x_train, y_train)
predictedvalues = linear_model.predict(x_test)
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
lrmodelrmse = np.sqrt(mean_squared_error(y_test, predictedvalues))
mae = mean_absolute_error(y_test, predictedvalues)
r2 = r2_score(y_test, predictedvalues)
print(f"R-squared: {r2}")
print(f"Mean Absolute Error: {mae}")
print("RMSE value for Linear regression is", lrmodelrmse)
# random forest model training
from sklearn.ensemble import RandomForestRegressor # time to train: 4m 50s
rfrmodel = RandomForestRegressor(n_estimators=100, random_state=101)
rfrmodel.fit(x_train,y_train)
rfrmodel_pred= rfrmodel.predict(x_test)
# Evaluation
rfrmodel_rmse = np.sqrt(mean_squared_error(y_test, rfrmodel_pred))
rfr_mae = mean_absolute_error(y_test, rfrmodel_pred)
rfr_r2 = r2_score(y_test, rfrmodel_pred)
print("RMSE value for Random forest regression is ",rfrmodel_rmse)
print(f"R-squared: {rfr_r2}")
print(f"Mean Absolute Error: {rfr_mae}")
# Test data preprocessing
test = pd.read_csv(r'/content/drive/MyDrive/ML Lab Sem 7/testt.csv')
test.drop(columns=['Unnamed: 0', 'Unnamed: 0.1', 'Unnamed: 0.2', 'key'], inplace=True)
test["pickup_datetime"] = pd.to_datetime(test["pickup_datetime"])
test['day']=test['pickup_datetime'].apply(lambda x:x.day)
test['hour']=test['pickup_datetime'].apply(lambda x:x.hour)
test['month']=test['pickup_datetime'].apply(lambda x:x.month)
test['year']=test['pickup_datetime'].apply(lambda x:x.year)
test['weekday']=test['pickup_datetime'].apply(lambda x: calendar.day_name[x.weekday()])
test.drop(['pickup_datetime'], axis = 1, inplace = True)
test.weekday = test.weekday.map({'Sunday':0,'Monday':1,'Tuesday':2,'Wednesday':3,'Thursday':4,'Friday':5,'Saturday':6})
test.isnull().sum()
test.dropna(axis = 0, inplace= True)
rfrmodel_pred= rfrmodel.predict(test)
df = pd.DataFrame(rfrmodel_pred, columns=['predicted_fare_amount'])
df
Experiment 2
Exp 2: Write a program to classify emails as spam or not spam using binary classification. Use KNN and SVM for classification and analyze their performance.
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix,classification_report, mean_squared_error,f1_score,precision_score
import matplotlib.pyplot as plt
import seaborn as sns
df = pd.read_csv("/content/drive/MyDrive/ML Lab Sem 7/emails.csv")
df.head(5)
df.shape
df.isnull().sum()
# Define the input variables and the target variable
X = df.iloc[:,1:3001]
Y = df.iloc[:,-1].values
train_x,test_x,train_y,test_y = train_test_split(X,Y,test_size = 0.25)
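# Optional sketch: the word-count features vary widely in scale, and distance-based models such
# as SVM and KNN are sensitive to that. Scaling is not part of the original pipeline; the scaled
# arrays below are unused unless substituted into the fit/predict calls.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
train_x_scaled = scaler.fit_transform(train_x)  # fit the scaler on the training split only
test_x_scaled = scaler.transform(test_x)        # reuse the training-set statistics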
# SVM model
svc = SVC(C=1.0,kernel='rbf',gamma='auto')
svc.fit(train_x,train_y) # training
y_pred2 = svc.predict(test_x)
# Confusion Matrix
conf_matrix_svm = confusion_matrix(test_y, y_pred2)
mse_svm = mean_squared_error(test_y, y_pred2)
rmse_svm = np.sqrt(mse_svm)
# Plot Confusion Matrix
plt.figure(figsize=(6,4))
sns.heatmap(conf_matrix_svm, annot=True, fmt='d', cmap='Blues', xticklabels=['Not Spam (0)', 'Spam (1)'], yticklabels=['Not Spam (0)', 'Spam (1)'])
plt.title('SVM Confusion Matrix')
plt.ylabel('Actual Label')
plt.xlabel('Predicted Label')
plt.show()
# Print classification metrics
print("Classification Report:\n", classification_report(test_y, y_pred2))
print("Accuracy:", accuracy_score(test_y, y_pred2))
print("Precision:", precision_score(test_y, y_pred2))
print("SVM F1-Score:", f1_score(test_y, y_pred2))
print("Root Mean Squared Error (RMSE):", rmse_svm)
# KNN model
# Initialize the KNN classifier with k=5
knn = KNeighborsClassifier(n_neighbors=5)
# Train the KNN model
knn.fit(train_x, train_y)
# Predict on the test data
y_pred = knn.predict(test_x)
# Confusion Matrix
conf_matrix = confusion_matrix(test_y, y_pred)
mse_knn = mean_squared_error(test_y, y_pred)
rmse_knn = np.sqrt(mse_knn)
# Plot the Confusion Matrix
plt.figure(figsize=(6,4))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=['Not Spam (0)', 'Spam (1)'], yticklabels=['Not Spam (0)', 'Spam (1)'])
plt.title('KNN Confusion Matrix')
plt.ylabel('Actual Label')
plt.xlabel('Predicted Label')
plt.show()
# Print classification metrics like Precision, Recall, F1 Score
print("Classification Report:\n", classification_report(test_y, y_pred))
print("Accuracy:", accuracy_score(test_y, y_pred))
print("Precision:", precision_score(test_y, y_pred))
print("KNN F1-Score:", f1_score(test_y, y_pred))
print("Root Mean Squared Error (RMSE):", rmse_knn)
Experiment 3
Exp 3: Given a bank customer, build a neural network-based classifier that can determine whether they will leave or not in the next 6 months.
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix, f1_score, precision_score, classification_report
from sklearn.preprocessing import StandardScaler
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
# Load the dataset
df = pd.read_csv('/content/drive/MyDrive/ML Lab Sem 7/Churn_Modelling.csv')
# Preprocessing
df = pd.get_dummies(df, columns=['Gender', 'Geography'], drop_first=True) # One-hot encoding
X = df.drop(['CustomerId', 'Exited', 'RowNumber', 'Surname'], axis=1) # Drop unnecessary columns
y = df['Exited'] # 'Exited' is the target variable
# Scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
# Split the data
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)
# Create the neural network model
model = keras.Sequential([
    layers.Dense(32, activation='relu', input_shape=(X_train.shape[1],)),  # Input layer
    layers.Dense(16, activation='relu'),  # Hidden layer
    layers.Dense(1, activation='sigmoid')  # Output layer for binary classification
])
# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
# Train the model
model.fit(X_train, y_train, epochs=50, batch_size=32, validation_split=0.1)
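# Optional sketch: an EarlyStopping callback stops training once validation loss stops improving.
# The commented fit call is an alternative to the one above, not part of the original run; patience=5 is illustrative.
early_stop = keras.callbacks.EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
# model.fit(X_train, y_train, epochs=50, batch_size=32, validation_split=0.1, callbacks=[early_stop])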
test_loss, test_accuracy = model.evaluate(X_test, y_test)
print(f'Test Accuracy: {test_accuracy:.2f}')
print(f'Loss: {test_loss}')
y_pred = model.predict(X_test) # Predict probabilities
y_pred_classes = np.where(y_pred > 0.5, 1, 0) # Convert probabilities to binary predictions
# Calculate confusion matrix
cm = confusion_matrix(y_test, y_pred_classes)
# Calculate F1-score and Precision
f1 = f1_score(y_test, y_pred_classes)
precision = precision_score(y_test, y_pred_classes)
# Print metrics
print(f'F1 Score: {f1:.2f}')
print(f'Precision: {precision:.2f}')
print(classification_report(y_test, y_pred_classes)) # Detailed metrics
# Visualize the confusion matrix
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
xticklabels=['Will Not Leave', 'Will Leave'],
yticklabels=['Will Not Leave', 'Will Leave'])
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('Confusion Matrix for Customer Churn Prediction')
plt.show()
Experiment 4
Exp 4: Implement the gradient descent algorithm to find a local minimum of a function
import numpy as np
import sympy as sp
def gradient_descent(func, initial_x, learning_rate=0.01, max_iterations=1000, tolerance=1e-6):
    x = sp.symbols('x')
    # Calculate the derivative of the function
    derivative = sp.diff(func, x)
    current_x = initial_x
    for _ in range(max_iterations):
        # Evaluate the derivative at the current x value
        grad = float(derivative.subs(x, current_x))
        # Update the current x value
        new_x = current_x - learning_rate * grad
        # Check for convergence
        if abs(new_x - current_x) < tolerance:
            break
        current_x = new_x
    return current_x, func.subs(x, current_x)
def main():
    # Take user input for the function and the initial x value
    func_input = input("Enter a function of x (e.g., x**2 + 3*x + 5): ")
    initial_x = float(input("Enter the initial value of x: "))
    # Convert the user input to a sympy expression
    func = sp.sympify(func_input)
    # Find a local minimum using gradient descent
    minima_x, minima_y = gradient_descent(func, initial_x)
    print(f"The local minimum occurs at x = {minima_x}, with a function value of f(x) = {minima_y}")

main()
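# Quick sanity check (can be run independently of main()): f(x) = x**2 + 3*x + 5 has derivative
# 2*x + 3, so analytically its minimum is at x = -1.5 with f(-1.5) = 2.75; gradient descent from
# x = 0 should converge close to these values.
check_x, check_y = gradient_descent(sp.sympify("x**2 + 3*x + 5"), initial_x=0.0)
print(check_x, check_y)  # expected: approximately -1.5 and 2.75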
Experiment 5
Exp 5: Implement the K-Nearest Neighbors algorithm on the diabetes.csv dataset. Compute the confusion matrix, accuracy, error rate, precision, and recall on the given dataset.
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score
import seaborn as sns
import matplotlib.pyplot as plt
# Load the dataset
data = pd.read_csv('/content/drive/MyDrive/ML Lab Sem 7/diabetes.csv')
data.isnull().sum()
# In this dataset, zeros in these clinical columns denote missing values; mark them as NaN,
# then fill Glucose and BloodPressure with the mean and the other features with the median
cols_with_missing = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
data[cols_with_missing] = data[cols_with_missing].replace(0, np.nan)
data['Glucose'] = data['Glucose'].fillna(data['Glucose'].mean())
data['BloodPressure'] = data['BloodPressure'].fillna(data['BloodPressure'].mean())
data['SkinThickness'] = data['SkinThickness'].fillna(data['SkinThickness'].median())
data['Insulin'] = data['Insulin'].fillna(data['Insulin'].median())
data['BMI'] = data['BMI'].fillna(data['BMI'].median())
# Exploratory Data Analysis (EDA) - Correlation heatmap
plt.figure(figsize=(12, 10))
sns.heatmap(data.corr(), annot=True, cmap='RdYlGn')
plt.show()
# Standardize the dataset
X = data.drop("Outcome", axis=1)
y = data['Outcome']
scaler = StandardScaler()
X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)
# Split the data into training and testing sets with stratification
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, stratify=y, random_state=42)
# KNN model tuning using GridSearchCV and cross-validation
parameters_grid = {'n_neighbors': np.arange(1, 30)}
knn = KNeighborsClassifier()
knn_grid_search = GridSearchCV(knn, param_grid=parameters_grid, cv=5)
knn_grid_search.fit(X_train, y_train)
# Best K value found by GridSearchCV
print(f"Best K value: {knn_grid_search.best_params_['n_neighbors']}")
# Train the KNN classifier with the optimal K value
knn_final = KNeighborsClassifier(n_neighbors=knn_grid_search.best_params_['n_neighbors'])
knn_final.fit(X_train, y_train)
# Now, you can make predictions on the test set
y_pred = knn_final.predict(X_test)
# Compute the confusion matrix and plot it as a heatmap
conf_matrix = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(6, 4))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=['Non-Diabetic', 'Diabetic'], yticklabels=['Non-Diabetic', 'Diabetic'])
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('Confusion Matrix')
plt.show()
# Compute accuracy, error rate, precision, and recall
accuracy = accuracy_score(y_test, y_pred)
error_rate = 1 - accuracy
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")
print(f"Error Rate: {error_rate:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
# Make a single prediction (example input data)
new_data = pd.DataFrame([[2, 150, 85, 35, 0, 33.6, 0.627, 50]], columns=X.columns)
new_data_scaled = scaler.transform(new_data)
prediction = knn_final.predict(new_data_scaled)
print(f"Prediction for the new input data: {'Diabetic' if prediction[0] == 1 else 'Non-Diabetic'}")
Experiment 6
Exp 6: Implement hierarchical clustering on the sales_data_sample.csv dataset. Determine the number of clusters using the elbow method.
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score
from sklearn.cluster import AgglomerativeClustering
from scipy.cluster.hierarchy import dendrogram, linkage
import matplotlib.pyplot as plt
import seaborn as sns
df = pd.read_csv('/content/drive/MyDrive/ML Lab Sem 7/sales_data_sample.csv', encoding='ISO-8859-1')
# Step 2: Select relevant features for clustering
features = ['QUANTITYORDERED', 'PRICEEACH', 'SALES', 'QTR_ID', 'MONTH_ID', 'YEAR_ID', 'MSRP']
data = df[features]
# Step 3: Handle missing values by filling with column mean
data = data.fillna(data.mean())
# Step 4: Scale the data
scaler = StandardScaler()
scaled_data = scaler.fit_transform(data)
# Step 5: Determine the optimal number of clusters using the elbow method
# (AgglomerativeClustering has no inertia_ attribute, so the distortion is computed
# as the within-cluster sum of squares from the agglomerative labels)
def plot_elbow_method(data):
    distortions = []
    K = range(2, 11)  # check cluster counts from 2 to 10
    for k in K:
        model = AgglomerativeClustering(n_clusters=k)
        labels = model.fit_predict(data)
        wcss = 0.0
        for label in np.unique(labels):
            cluster_points = data[labels == label]
            wcss += ((cluster_points - cluster_points.mean(axis=0)) ** 2).sum()
        distortions.append(wcss)
    plt.figure(figsize=(8, 5))
    plt.plot(K, distortions, 'bx-')
    plt.xlabel('Number of clusters')
    plt.ylabel('Distortion (within-cluster sum of squares)')
    plt.title('Elbow Method for Optimal k')
    plt.show()

plot_elbow_method(scaled_data)
# Step 6: Plot a dendrogram for hierarchical clustering
linked = linkage(scaled_data, method='ward')
plt.figure(figsize=(10, 7))
dendrogram(linked, orientation='top', distance_sort='descending', show_leaf_counts=True)
plt.title("Dendrogram")
plt.show()
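# Optional sketch: the dendrogram can also be cut at a chosen cluster count using fcluster
# (k = 3 below is illustrative; the silhouette-based selection in the next step is what the pipeline uses).
from scipy.cluster.hierarchy import fcluster
dendro_labels = fcluster(linked, t=3, criterion='maxclust')
print(pd.Series(dendro_labels).value_counts())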
# Step 7: Automatically determine the best number of clusters using silhouette score
def find_optimal_clusters(data):
    max_score = -1
    best_k = 2
    for k in range(2, 11):
        model = AgglomerativeClustering(n_clusters=k)
        cluster_labels = model.fit_predict(data)
        score = silhouette_score(data, cluster_labels)
        print(f'Clusters: {k}, Silhouette Score: {score:.4f}')
        if score > max_score:
            max_score = score
            best_k = k
    return best_k
optimal_clusters = find_optimal_clusters(scaled_data)
print(f'Optimal number of clusters based on silhouette score: {optimal_clusters}')
# Step 8: Perform hierarchical clustering with Agglomerative Clustering
final_model = AgglomerativeClustering(n_clusters=optimal_clusters)
clusters = final_model.fit_predict(scaled_data)
# Add the cluster labels to the original dataset
df['Cluster'] = clusters
# Step 9: Perform PCA for 3D visualization
pca = PCA(n_components=3)
pca_data = pca.fit_transform(scaled_data)
fig = plt.figure(figsize=(10, 7))
ax = fig.add_subplot(111, projection='3d')
scatter = ax.scatter(pca_data[:, 0], pca_data[:, 1], pca_data[:, 2], c=clusters, cmap='viridis')
# Adding labels and legend
ax.set_xlabel('PC 1')
ax.set_ylabel('PC 2')
ax.set_zlabel('PC 3')
plt.colorbar(scatter, ax=ax, label='Cluster')
plt.title('3D PCA of Clusters')
plt.show()
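# Optional sketch: profile the clusters on the original (unscaled) features to interpret them.
print(df.groupby('Cluster')[features].mean())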