Expt No: 10 Expectation–Maximization (EM) Algorithm
Date:
Aim: To write a program to demonstrate how missing values are handled using SimpleImputer and the Expectation–Maximization (EM) algorithm.
Program
# SimpleImputer method
#
# Blanks out 20% of one Iris feature, fills the gaps with the training-set
# column mean, and reports the accuracy of a Gaussian Naive Bayes classifier
# trained on the imputed data.
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB

# Step 1: Load the dataset (missing values are injected in Step 2).
iris_data = pd.read_csv("Iris.csv")

# Step 2: Introduce missing values in a specific feature (e.g., petal length).
# NOTE(review): assumes Iris.csv has a "PetalLength" column - confirm against
# the file's actual header.
feature_with_missing_values = "PetalLength"
missing_percentage = 0.2

# Randomly select the rows whose value will be removed; the fixed seed keeps
# the selection reproducible between runs.
missing_indices = iris_data.sample(
    frac=missing_percentage, random_state=42
).index

# Set the selected rows to missing in the chosen feature.
iris_data.loc[missing_indices, feature_with_missing_values] = None

# Step 3: Define features and target.
X = iris_data.drop("Species", axis=1)
y = iris_data["Species"]

# Step 4: Split the dataset into train and test sets.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Step 5: Preprocess the data to handle missing values.
# The imputer is fitted on the training split only and then applied to the
# test split, so the test data does not influence the fill value.
imputer = SimpleImputer(strategy="mean")
X_train_imputed = imputer.fit_transform(X_train)
X_test_imputed = imputer.transform(X_test)

# Step 6: Train a Naive Bayes classifier.
classifier = GaussianNB()
classifier.fit(X_train_imputed, y_train)

# Step 7: Make predictions.
y_pred = classifier.predict(X_test_imputed)

# Step 8: Calculate accuracy.
accuracy = accuracy_score(y_test, y_pred)
print(
    "Accuracy after imputing missing values with SimpleImputer: "
    "{:.4f}".format(accuracy)
)
# EM Algorithm
#
# Blanks out 20% of one Iris feature, fills each gap with a value sampled from
# a one-dimensional Gaussian mixture fitted to the observed training values of
# that feature, and reports the accuracy of a Gaussian Naive Bayes classifier
# trained on the imputed data.
import os
import warnings

# Set the thread limit before scikit-learn is imported so that the native
# thread pools can pick it up when they are initialised.
os.environ["OMP_NUM_THREADS"] = "1"

import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.mixture import GaussianMixture
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB

warnings.filterwarnings("ignore", category=UserWarning)

# Step 1: Load the dataset (missing values are injected in Step 2).
iris_data = pd.read_csv("Iris.csv")

# Step 2: Introduce missing values in a specific feature (e.g., PetalLength).
# NOTE(review): assumes Iris.csv has a "PetalLength" column - confirm against
# the file's actual header.
feature_with_missing_values = "PetalLength"
missing_percentage = 0.2

# Randomly select the rows whose value will be removed; the fixed seed keeps
# the selection reproducible between runs.
missing_indices = iris_data.sample(
    frac=missing_percentage, random_state=42
).index

# Set the selected rows to missing in the chosen feature.
iris_data.loc[missing_indices, feature_with_missing_values] = None

# Step 3: Define features and target.
X = iris_data.drop("Species", axis=1)
y = iris_data["Species"]

# Step 4: Split the dataset into train and test sets.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Step 5: Initialize the GMM.
gmm = GaussianMixture(n_components=3, random_state=42)

# Step 6: Impute missing values using the fitted mixture.
X_train_imputed = X_train.copy()
X_test_imputed = X_test.copy()

for feature in X_train_imputed.columns:
    train_missing = X_train_imputed[feature].isnull()
    test_missing = X_test_imputed[feature].isnull()

    # Nothing to impute for this feature: skip the fit entirely instead of
    # drawing a dummy sample and assigning it to an empty selection.
    if not (train_missing.any() or test_missing.any()):
        continue

    # Fit the GMM on the observed (non-missing) training values only.
    gmm.fit(X_train_imputed.loc[~train_missing, [feature]])

    # Replace each missing entry with a value sampled from the fitted mixture.
    # sample() returns (values, component_labels); only the values are used.
    if train_missing.any():
        X_train_imputed.loc[train_missing, [feature]] = gmm.sample(
            n_samples=int(train_missing.sum())
        )[0]
    if test_missing.any():
        X_test_imputed.loc[test_missing, [feature]] = gmm.sample(
            n_samples=int(test_missing.sum())
        )[0]

# Step 7: Train a Naive Bayes classifier.
classifier = GaussianNB()
classifier.fit(X_train_imputed, y_train)

# Step 8: Make predictions.
y_pred = classifier.predict(X_test_imputed)

# Step 9: Calculate accuracy.
accuracy = accuracy_score(y_test, y_pred)
print(
    "Accuracy after imputing missing values with EM algorithm: "
    "{:.4f}".format(accuracy)
)
Result: Thus the program to demonstrate how missing values are handled using SimpleImputer and the Expectation–Maximization (EM) algorithm was written and executed.
Sample Output
# SimpleImputer method
Accuracy after imputing missing values with SimpleImputer: 0.6333
# EM Algorithm
Accuracy after imputing missing values with EM algorithm: 0.7667
No comments:
Post a Comment
Don't be a silent reader...
Leave your comments...
Anu