Regression models

 

Expt No: 5                                           Regression models

Date:

 

Aim: To write a program to demonstrate various Regression models

 

Program

 

# Linear Regression, Bayesian Linear Regression and Polynomial Regression

 

import pandas as pd

from sklearn.model_selection import train_test_split

from sklearn.linear_model import LinearRegression, BayesianRidge

from sklearn.preprocessing import PolynomialFeatures

from sklearn.metrics import mean_squared_error

from sklearn.impute import SimpleImputer

import numpy as np

import matplotlib.pyplot as plt

 

# Load the dataset

# Load the housing dataset (expects HousingData.csv in the working directory).
df = pd.read_csv('HousingData.csv')

# 'MEDV' (median home value) is the dependent variable; all remaining columns
# are used as predictors.
X = df.drop('MEDV', axis=1)
y = df['MEDV']

# Hold out 20% of the rows for testing; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# The CSV contains missing values; replace each NaN with its column mean.
# The imputer is fitted on the training split only, then applied to the test
# split, so no test-set statistics leak into training.
imputer = SimpleImputer(strategy='mean')
X_train_imputed = imputer.fit_transform(X_train)
X_test_imputed = imputer.transform(X_test)

 

# Linear Regression

# Ordinary least-squares linear regression; predictions are kept for both
# splits so train vs test error can be compared below.
lin_reg = LinearRegression()
lin_reg.fit(X_train_imputed, y_train)
lin_reg_train_pred = lin_reg.predict(X_train_imputed)
lin_reg_test_pred = lin_reg.predict(X_test_imputed)

# Bayesian ridge regression: a linear model with priors over the coefficients,
# which regularizes the fit relative to plain OLS.
bayesian_reg = BayesianRidge()
bayesian_reg.fit(X_train_imputed, y_train)
bayesian_reg_train_pred = bayesian_reg.predict(X_train_imputed)
bayesian_reg_test_pred = bayesian_reg.predict(X_test_imputed)

 

# Polynomial Regression (degree=2)

# Polynomial regression: expand the features to all degree-2 terms (squares
# and pairwise interactions), then fit an ordinary linear model on the
# expanded design matrix. The expansion is fitted on the training data and
# reused unchanged on the test data.
poly_reg = PolynomialFeatures(degree=2)
X_train_poly = poly_reg.fit_transform(X_train_imputed)
X_test_poly = poly_reg.transform(X_test_imputed)
poly_lin_reg = LinearRegression()
poly_lin_reg.fit(X_train_poly, y_train)
poly_lin_reg_train_pred = poly_lin_reg.predict(X_train_poly)
poly_lin_reg_test_pred = poly_lin_reg.predict(X_test_poly)

 

# Calculate mean squared error

# Mean squared error on both splits for each model. A large gap between
# train and test MSE (as with the polynomial model) indicates overfitting.
lin_reg_train_mse = mean_squared_error(y_train, lin_reg_train_pred)
lin_reg_test_mse = mean_squared_error(y_test, lin_reg_test_pred)
bayesian_reg_train_mse = mean_squared_error(y_train, bayesian_reg_train_pred)
bayesian_reg_test_mse = mean_squared_error(y_test, bayesian_reg_test_pred)
poly_lin_reg_train_mse = mean_squared_error(y_train, poly_lin_reg_train_pred)
poly_lin_reg_test_mse = mean_squared_error(y_test, poly_lin_reg_test_pred)

 

# Print the train/test MSE of every model in a uniform three-line format.
_mse_rows = (
    ("Linear Regression", lin_reg_train_mse, lin_reg_test_mse),
    ("Bayesian Linear Regression", bayesian_reg_train_mse, bayesian_reg_test_mse),
    ("Polynomial Regression (degree=2)", poly_lin_reg_train_mse, poly_lin_reg_test_mse),
)
for _model_name, _train_mse, _test_mse in _mse_rows:
    print(f"{_model_name}:")
    print(f"  Train MSE: {_train_mse:.2f}")
    print(f"  Test MSE: {_test_mse:.2f}")

 


 

# Plot actual vs predicted prices

# Overlay the three models' test-set predictions against the actual prices in
# one scatter plot; points near the diagonal indicate accurate predictions.
plt.figure(figsize=(12, 6))
plt.scatter(y_test, lin_reg_test_pred, color='blue', label='Linear Regression')
plt.scatter(y_test, bayesian_reg_test_pred, color='green', label='Bayesian Linear Regression')
plt.scatter(y_test, poly_lin_reg_test_pred, color='red', label='Polynomial Regression (degree=2)')
plt.xlabel('Actual Price')
plt.ylabel('Predicted Price')
plt.title('Actual vs Predicted Prices (Regression)')
plt.legend()
plt.show()

 

# Per-model actual-vs-predicted scatter plots. The three original copies of
# this figure differed only in the prediction array, color, label, and title
# (and, despite their comments, never drew a fitted line), so the repetition
# is factored into one helper called three times.
def _plot_actual_vs_predicted(predictions, color, label, title):
    """Scatter the actual test prices against *predictions* in its own figure.

    predictions: array of predicted prices aligned with y_test.
    color/label: matplotlib color and legend label for the scatter points.
    title: figure title.
    """
    plt.figure(figsize=(12, 6))
    plt.scatter(y_test, predictions, color=color, label=label)
    plt.xlabel('Actual Price')
    plt.ylabel('Predicted Price')
    plt.title(title)
    plt.legend()
    plt.show()


_plot_actual_vs_predicted(lin_reg_test_pred, 'blue', 'Linear Regression',
                          'Actual vs Predicted Prices for Linear Regression')
_plot_actual_vs_predicted(poly_lin_reg_test_pred, 'red', 'Polynomial Regression (degree=2)',
                          'Actual vs Predicted Prices for Polynomial Regression')
_plot_actual_vs_predicted(bayesian_reg_test_pred, 'green', 'Bayesian Linear Regression',
                          'Actual vs Predicted Prices for Bayesian Linear Regression')



# Logistic Regression

# Single variable

 

import pandas as pd

import numpy as np

from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression

from sklearn.metrics import accuracy_score

 

# Generate synthetic dataset with multiple features

# Fix the RNG seed so the synthetic dataset (and hence every accuracy figure
# printed later) is reproducible.
np.random.seed(42)
n_samples = 1000

# Generate features: transaction amount (normal, mean 50, sd 20), transaction
# time in hours (uniform over a 24-hour day), and transaction channel.
transaction_amount = np.random.normal(loc=50, scale=20, size=n_samples)
transaction_time = np.random.uniform(low=0, high=24, size=n_samples)  # Transaction time in hours
transaction_type = np.random.choice(['Online', 'In-person'], size=n_samples)

# Label a transaction fraudulent (1) when ANY of these holds: it occurred
# between 1:00 AM and 6:00 AM, it was made online, or its amount exceeds 70.
# Note the label is deterministic given the features, not probabilistic.
is_fraudulent = (((transaction_time >= 1) & (transaction_time <= 6)) |
                 (transaction_type == 'Online') |
                 (transaction_amount > 70)).astype(int)

# Assemble features and label into a single DataFrame.
df = pd.DataFrame({
    'TransactionAmount': transaction_amount,
    'TransactionTime': transaction_time,
    'TransactionType': transaction_type,
    'IsFraudulent': is_fraudulent
})

# One-hot encode the categorical 'TransactionType' column (produces
# TransactionType_Online and TransactionType_In-person indicator columns).
df = pd.get_dummies(df, columns=['TransactionType'])

 


 

# Separate datasets based on each feature for logistic regression

# Fit a separate logistic regression on each candidate predictor so their
# individual predictive power can be compared. The 'Transaction Type' entry
# keeps both one-hot indicator columns produced by get_dummies.
feature_subsets = [
    ('Transaction Amount', df[['TransactionAmount']]),
    ('Transaction Time', df[['TransactionTime']]),
    ('Transaction Type', df.drop(['TransactionAmount', 'TransactionTime', 'IsFraudulent'], axis=1)),
]

for subset_label, subset in feature_subsets:
    # Same 80/20 split and seed for every subset, so the accuracies are comparable.
    X_train, X_test, y_train, y_test = train_test_split(subset, df['IsFraudulent'], test_size=0.2, random_state=42)
    clf = LogisticRegression(solver='liblinear')
    clf.fit(X_train, y_train)
    predictions = clf.predict(X_test)
    score = accuracy_score(y_test, predictions)
    print(f"Accuracy based on {subset_label} only : {score:.2f}")
    print()

 


 

# Logistic Regression

# Multiple variables

 

import pandas as pd

import numpy as np

from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression

from sklearn.metrics import accuracy_score

 

# Generate synthetic dataset with multiple features

# Regenerate the same synthetic dataset as the single-variable section (this
# part of the record is a standalone program); the repeated seed keeps it
# identical to the earlier one.
np.random.seed(42)
n_samples = 1000

# Features: transaction amount (normal, mean 50, sd 20), transaction time in
# hours (uniform over 24h), and transaction channel.
transaction_amount = np.random.normal(loc=50, scale=20, size=n_samples)
transaction_time = np.random.uniform(low=0, high=24, size=n_samples)  # Transaction time in hours
transaction_type = np.random.choice(['Online', 'In-person'], size=n_samples)

# Deterministic label: fraudulent (1) when the transaction is between
# 1:00 AM and 6:00 AM, OR online, OR its amount exceeds 70.
is_fraudulent = (((transaction_time >= 1) & (transaction_time <= 6)) |
                 (transaction_type == 'Online') |
                 (transaction_amount > 70)).astype(int)

# Assemble features and label into a single DataFrame.
df = pd.DataFrame({
    'TransactionAmount': transaction_amount,
    'TransactionTime': transaction_time,
    'TransactionType': transaction_type,
    'IsFraudulent': is_fraudulent
})

# One-hot encode the categorical 'TransactionType' column.
df = pd.get_dummies(df, columns=['TransactionType'])

 

# Split the data into training and testing sets

# Train a single logistic regression on ALL predictors at once and report its
# accuracy, for comparison against the one-feature models above.
feature_matrix = df.drop('IsFraudulent', axis=1)
target = df['IsFraudulent']

# Same 80/20 split and seed as the single-feature runs, so scores are comparable.
train_X, test_X, train_y, test_y = train_test_split(feature_matrix, target, test_size=0.2, random_state=42)

clf = LogisticRegression(solver='liblinear')
clf.fit(train_X, train_y)

# Accuracy of the combined model on the held-out 20%.
test_predictions = clf.predict(test_X)
score = accuracy_score(test_y, test_predictions)
print(f"Accuracy involving all three variables : {score:.2f}")

 

 

 

 

Result: Thus the program to demonstrate Regression models was written and executed.



Sample Output

Linear Regression, Bayesian Linear Regression and Polynomial Regression:

 

Linear Regression:

  Train MSE: 22.40

  Test MSE: 25.00

 

Bayesian Linear Regression:

  Train MSE: 23.09

  Test MSE: 25.30

 

Polynomial Regression (degree=2):

  Train MSE: 6.53

  Test MSE: 16.46










 

# Logistic Regression

# Single variable

Accuracy based on Transaction Amount only : 0.67

Accuracy based on Transaction Time only : 0.69

Accuracy based on Transaction Type only : 0.86

 

# Logistic Regression

# Multiple variables

Accuracy involving all three variables : 0.91






No comments:

Post a Comment

Don't be a silent reader...
Leave your comments...

Anu