Expt No: 5 Regression models
Date:
Aim: To write a program to demonstrate various regression models.
Program
# Linear Regression, Bayesian Linear Regression and Polynomial Regression
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, BayesianRidge
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_squared_error
from sklearn.impute import SimpleImputer
import numpy as np
import matplotlib.pyplot as plt
# Load the dataset
df = pd.read_csv('HousingData.csv')
# Assume 'MEDV' is the dependent variable and the rest are independent variables
X = df.drop('MEDV', axis=1)
y = df['MEDV']
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Handle missing values using simple imputation with the mean
imputer = SimpleImputer(strategy='mean')
X_train_imputed = imputer.fit_transform(X_train)
X_test_imputed = imputer.transform(X_test)
# Linear Regression
lin_reg = LinearRegression()
lin_reg.fit(X_train_imputed, y_train)
lin_reg_train_pred = lin_reg.predict(X_train_imputed)
lin_reg_test_pred = lin_reg.predict(X_test_imputed)
# Bayesian Linear Regression
bayesian_reg = BayesianRidge()
bayesian_reg.fit(X_train_imputed, y_train)
bayesian_reg_train_pred = bayesian_reg.predict(X_train_imputed)
bayesian_reg_test_pred = bayesian_reg.predict(X_test_imputed)
# Polynomial Regression (degree=2)
poly_reg = PolynomialFeatures(degree=2)
X_train_poly = poly_reg.fit_transform(X_train_imputed)
X_test_poly = poly_reg.transform(X_test_imputed)
poly_lin_reg = LinearRegression()
poly_lin_reg.fit(X_train_poly, y_train)
poly_lin_reg_train_pred = poly_lin_reg.predict(X_train_poly)
poly_lin_reg_test_pred = poly_lin_reg.predict(X_test_poly)
# Calculate mean squared error
lin_reg_train_mse = mean_squared_error(y_train, lin_reg_train_pred)
lin_reg_test_mse = mean_squared_error(y_test, lin_reg_test_pred)
bayesian_reg_train_mse = mean_squared_error(y_train, bayesian_reg_train_pred)
bayesian_reg_test_mse = mean_squared_error(y_test, bayesian_reg_test_pred)
poly_lin_reg_train_mse = mean_squared_error(y_train, poly_lin_reg_train_pred)
poly_lin_reg_test_mse = mean_squared_error(y_test, poly_lin_reg_test_pred)
print("Linear Regression:")
print(f"
Train MSE: {lin_reg_train_mse:.2f}")
print(f"
Test MSE: {lin_reg_test_mse:.2f}")
print("Bayesian Linear Regression:")
print(f"
Train MSE: {bayesian_reg_train_mse:.2f}")
print(f"
Test MSE: {bayesian_reg_test_mse:.2f}")
print("Polynomial Regression (degree=2):")
print(f"
Train MSE: {poly_lin_reg_train_mse:.2f}")
print(f"
Test MSE: {poly_lin_reg_test_mse:.2f}")
# Plot actual vs predicted prices
plt.figure(figsize=(12, 6))
plt.scatter(y_test, lin_reg_test_pred, color='blue', label='Linear Regression')
plt.scatter(y_test, bayesian_reg_test_pred, color='green', label='Bayesian Linear Regression')
plt.scatter(y_test, poly_lin_reg_test_pred, color='red', label='Polynomial Regression (degree=2)')
plt.xlabel('Actual Price')
plt.ylabel('Predicted Price')
plt.title('Actual vs Predicted Prices (Regression)')
plt.legend()
plt.show()
# Plot actual vs predicted prices for Linear Regression
plt.figure(figsize=(12, 6))
plt.scatter(y_test, lin_reg_test_pred, color='blue', label='Linear Regression')
plt.xlabel('Actual Price')
plt.ylabel('Predicted Price')
plt.title('Actual vs Predicted Prices for Linear Regression')
plt.legend()
plt.show()
# Plot actual vs predicted prices for Polynomial Regression
plt.figure(figsize=(12, 6))
plt.scatter(y_test, poly_lin_reg_test_pred, color='red', label='Polynomial Regression (degree=2)')
plt.xlabel('Actual Price')
plt.ylabel('Predicted Price')
plt.title('Actual vs Predicted Prices for Polynomial Regression')
plt.legend()
plt.show()
# Plot actual vs predicted prices for Bayesian Linear Regression
plt.figure(figsize=(12, 6))
plt.scatter(y_test, bayesian_reg_test_pred, color='green', label='Bayesian Linear Regression')
plt.xlabel('Actual Price')
plt.ylabel('Predicted Price')
plt.title('Actual vs Predicted Prices for Bayesian Linear Regression')
plt.legend()
plt.show()
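To see what the PolynomialFeatures(degree=2) step in the program above actually produces, the following is a minimal, self-contained sketch on a toy two-feature input (illustration only, not part of the recorded program):

# Minimal sketch (illustration only): PolynomialFeatures(degree=2) expands
# [x1, x2] into [1, x1, x2, x1^2, x1*x2, x2^2], which is why the polynomial
# model above trains on many more columns than the raw housing features.
import numpy as np
from sklearn.preprocessing import PolynomialFeatures

toy = np.array([[2.0, 3.0]])                       # one sample with features x1=2, x2=3
expanded = PolynomialFeatures(degree=2).fit_transform(toy)
print(expanded)                                    # [[1. 2. 3. 4. 6. 9.]]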
# Logistic Regression
# Single variable
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
# Generate synthetic dataset with multiple features
np.random.seed(42)
n_samples = 1000
# Generate features: transaction amount, transaction time, and transaction type
transaction_amount = np.random.normal(loc=50, scale=20, size=n_samples)
transaction_time = np.random.uniform(low=0, high=24, size=n_samples)  # Transaction time in hours
transaction_type = np.random.choice(['Online', 'In-person'], size=n_samples)
# Generate target variable: is_fraudulent
# Assume transactions made between 1:00 AM and 6:00 AM, online transactions,
# and high transaction amounts have a higher probability of being fraudulent
is_fraudulent = (((transaction_time >= 1) & (transaction_time <= 6)) |
                 (transaction_type == 'Online') |
                 (transaction_amount > 70)).astype(int)
# Create DataFrame
df = pd.DataFrame({
'TransactionAmount': transaction_amount,
'TransactionTime': transaction_time,
'TransactionType': transaction_type,
'IsFraudulent': is_fraudulent
})
# One-hot encode the 'TransactionType' feature
df = pd.get_dummies(df, columns=['TransactionType'])
# Separate datasets based on each feature for logistic regression
datasets = [('Transaction Amount', df[['TransactionAmount']]),
            ('Transaction Time', df[['TransactionTime']]),
            ('Transaction Type', df.drop(['TransactionAmount', 'TransactionTime', 'IsFraudulent'], axis=1))]
# Perform logistic regression for each feature
for feature_name, X_feature in datasets:
    X_train, X_test, y_train, y_test = train_test_split(X_feature, df['IsFraudulent'], test_size=0.2, random_state=42)
    model = LogisticRegression(solver='liblinear')
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Accuracy based on {feature_name} only : {accuracy:.2f}")
    print()
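For a single-feature logistic model, the fitted coefficient and intercept can be read back to see roughly where the model places its decision boundary. The snippet below is a self-contained sketch on toy data; the feature name and the threshold of 70 simply mirror the synthetic rule used above, and it is an illustration rather than part of the recorded program:

# Minimal sketch (illustration only): for a one-feature logistic model the
# 0.5-probability boundary lies where intercept + coef * x = 0,
# i.e. x = -intercept / coef.
import numpy as np
from sklearn.linear_model import LogisticRegression

rng = np.random.default_rng(42)
amount = rng.normal(50, 20, 500).reshape(-1, 1)    # toy "transaction amount" feature
label = (amount.ravel() > 70).astype(int)          # toy rule: amounts above 70 are fraudulent
clf = LogisticRegression(solver='liblinear').fit(amount, label)
boundary = -clf.intercept_[0] / clf.coef_[0, 0]
print(f"Estimated decision boundary: {boundary:.1f}")  # expected to land near 70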
# Logistic Regression
# Multiple variables
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
# Generate synthetic dataset with multiple features
np.random.seed(42)
n_samples = 1000
# Generate features: transaction amount, transaction time, and transaction type
transaction_amount = np.random.normal(loc=50, scale=20, size=n_samples)
transaction_time = np.random.uniform(low=0, high=24, size=n_samples)  # Transaction time in hours
transaction_type = np.random.choice(['Online', 'In-person'], size=n_samples)
# Generate target variable: is_fraudulent
# Assume transactions made between 1:00 AM and 6:00 AM, online transactions,
# and high transaction amounts have a higher probability of being fraudulent
is_fraudulent = (((transaction_time >= 1) & (transaction_time <= 6)) |
                 (transaction_type == 'Online') |
                 (transaction_amount > 70)).astype(int)
# Create DataFrame
df = pd.DataFrame({
'TransactionAmount': transaction_amount,
'TransactionTime': transaction_time,
'TransactionType': transaction_type,
'IsFraudulent': is_fraudulent
})
# One-hot encode the 'TransactionType' feature
df = pd.get_dummies(df, columns=['TransactionType'])
# Split the data into training and testing sets
X = df.drop('IsFraudulent', axis=1)
y = df['IsFraudulent']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Fit logistic regression model
model = LogisticRegression(solver='liblinear')
model.fit(X_train, y_train)
# Predict the classes for the test set
y_pred = model.predict(X_test)
# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy involving all three variables : {accuracy:.2f}")
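Accuracy alone can be misleading when the two classes are unbalanced, so a confusion matrix or classification report gives a fuller picture. The lines below are an optional sketch (not part of the recorded program) that reuses y_test and y_pred from the multi-variable program above:

# Optional sketch (illustration only): fuller evaluation of the multi-variable
# model using the y_test / y_pred computed above.
from sklearn.metrics import classification_report, confusion_matrix
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))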
Result: Thus the program to demonstrate regression models was written and executed successfully.
Sample Output
Linear Regression, Bayesian Linear Regression and Polynomial Regression:
Linear Regression:
Train MSE: 22.40
Test MSE: 25.00
Bayesian Linear Regression:
Train MSE: 23.09
Test MSE: 25.30
Polynomial Regression (degree=2):
Train MSE: 6.53
Test MSE: 16.46
# Logistic Regression
# Single variable
Accuracy based on Transaction Amount only : 0.67
Accuracy based on Transaction Time only : 0.69
Accuracy based on Transaction Type only : 0.86
# Logistic Regression
# Multiple variables
Accuracy involving all three variables : 0.91