Linear Regression

Objective: understand the principles of linear regression, learn to solve linear-regression problems with the least-squares method, and master the Gram-Schmidt orthogonalization method and its variants as applied to least squares.

Tasks:

① Download (or construct) a suitable public dataset and split it into a training set and a test set.


② Fit the linear model y = Ax (A is the training set, x the model parameters, y the model prediction) with the least-squares method (see the note after this list).

③ Orthogonalize the training set A with the Gram-Schmidt method to obtain a new data matrix Q, then solve the linear model again using Q.

④ Process the original data matrix A so that some features become "nearly collinear but not collinear", obtaining a new data matrix B; orthogonalize B with the Gram-Schmidt method to obtain a new matrix Q, and solve the linear model again using Q.

⑤ Orthogonalize the data matrix B with the "better Gram-Schmidt" (modified Gram-Schmidt) method and solve the linear model once more.


⑥ Compare the mean squared error of the four methods above on the training and test sets, and present the comparison as a chart.

⑦ Original touches and ideas of your own are encouraged!
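
A note on the algebra behind ② and ③, stated here for reference (standard results, not from the assignment text): the least-squares minimizer of ||Ax - y||^2 solves the normal equations, x* = (A^T A)^{-1} A^T y. When A is replaced by a matrix Q with orthonormal columns (Q^T Q = I), this collapses to x* = Q^T y, which is why orthogonalizing the data both simplifies and numerically stabilizes the solve.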

import numpy as np

import matplotlib.pyplot as plt

from sklearn.datasets import load_iris

from sklearn.model_selection import train_test_split

# Load the iris dataset (the integer class labels serve as the regression target here)

iris = load_iris()

X, y = iris.data, iris.target

# Split the dataset into a training set and a test set

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Method 1 - ordinary least squares

# Add a bias (intercept) column

X_train_1 = np.hstack((np.ones((X_train.shape[0], 1)), X_train))

X_test_1 = np.hstack((np.ones((X_test.shape[0], 1)), X_test))

# Compute the least-squares solution via the normal equations; A_inv is
# (A^T A)^{-1} A^T, the pseudoinverse of the design matrix when it has
# full column rank

A_inv = np.linalg.inv(X_train_1.T @ X_train_1) @ X_train_1.T

weights_1 = A_inv @ y_train

# Compute predictions on the training and test sets

y_train_pred_1 = X_train_1 @ weights_1

y_test_pred_1 = X_test_1 @ weights_1

# Compute the mean squared error on the training and test sets

mse_train_1 = np.mean((y_train_pred_1 - y_train) ** 2)

mse_test_1 = np.mean((y_test_pred_1 - y_test) ** 2)
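
# A hedged aside (an addition, not in the original): forming (A^T A)^{-1}
# explicitly is numerically fragile when columns are nearly collinear;
# np.linalg.lstsq computes the same minimizer via a stable factorization
weights_1_alt, *_ = np.linalg.lstsq(X_train_1, y_train, rcond=None)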

# Method 2 - Gram-Schmidt orthogonalization

def gram_schmidt(X):

    # np.linalg.qr (Householder QR) serves as a numerically robust stand-in
    # for Gram-Schmidt; R is returned too so that test data can be mapped
    # into the same orthogonal basis (X = Q R, hence Q = X R^-1)
    Q, R = np.linalg.qr(X)

    return Q, R
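
# For reference, a minimal sketch of classical Gram-Schmidt itself (an
# illustrative addition; np.linalg.qr plays its role above). It assumes X has
# full column rank, otherwise the norm below would be zero.
def classical_gram_schmidt(X):
    n, d = X.shape
    Q = np.zeros((n, d))
    for j in range(d):
        # remove from column j its projections onto the previous q vectors
        u = X[:, j] - Q[:, :j] @ (Q[:, :j].T @ X[:, j])
        Q[:, j] = u / np.linalg.norm(u)
    return Q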

# Orthogonalize the training set (keep R to transform the test set later)

X_train_2, R_2 = gram_schmidt(X_train)

# Add a bias column

X_train_2 = np.hstack((np.ones((X_train_2.shape[0], 1)), X_train_2))

# Compute the least-squares solution

A_inv = np.linalg.inv(X_train_2.T @ X_train_2) @ X_train_2.T

weights_2 = A_inv @ y_train

# Map the test set into the training basis: X_train = Q R, so new rows are
# transformed by X_test @ R^-1 (orthogonalizing X_test separately would give
# a basis unrelated to the weights learned above)

X_test_2 = X_test @ np.linalg.inv(R_2)

X_test_2 = np.hstack((np.ones((X_test_2.shape[0], 1)), X_test_2))

# Compute predictions on the training and test sets

y_train_pred_2 = X_train_2 @ weights_2

y_test_pred_2 = X_test_2 @ weights_2

# Compute the mean squared error on the training and test sets

mse_train_2 = np.mean((y_train_pred_2 - y_train) ** 2)

mse_test_2 = np.mean((y_test_pred_2 - y_test) ** 2)

# Method 3 - feature transformation + Gram-Schmidt

# Transform the original data matrix (sin() is the feature map used here; see
# the sketch below for a construction that directly yields near-collinearity)

B_train = np.sin(X_train)

B_test = np.sin(X_test)
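
# Hedged sketch (an assumption, not from the original): sin(X) alone does not
# make features 'nearly collinear but not collinear' as item ④ asks. A direct
# construction appends a near-copy of an existing column plus tiny noise:
rng = np.random.default_rng(0)
B_train_alt = np.hstack((X_train, X_train[:, [0]] + 1e-6 * rng.standard_normal((X_train.shape[0], 1))))
B_test_alt = np.hstack((X_test, X_test[:, [0]] + 1e-6 * rng.standard_normal((X_test.shape[0], 1))))
# B_train_alt / B_test_alt are illustrative only and are not used below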

# Orthogonalize the transformed matrix with Gram-Schmidt, keeping R from the
# training fit so the test set is mapped into the same basis

Q_train_3, R_3 = gram_schmidt(B_train)

Q_test_3 = B_test @ np.linalg.inv(R_3)

# Add a bias column

Q_train_3 = np.hstack((np.ones((Q_train_3.shape[0], 1)), Q_train_3))

Q_test_3 = np.hstack((np.ones((Q_test_3.shape[0], 1)), Q_test_3))

# Compute the least-squares solution

A_inv = np.linalg.inv(Q_train_3.T @ Q_train_3) @ Q_train_3.T

weights_3 = A_inv @ y_train

# Compute predictions on the training and test sets

y_train_pred_3 = Q_train_3 @ weights_3

y_test_pred_3 = Q_test_3 @ weights_3

# Compute the mean squared error on the training and test sets

mse_train_3 = np.mean((y_train_pred_3 - y_train) ** 2)

mse_test_3 = np.mean((y_test_pred_3 - y_test) ** 2)

# Method 4 - "better" (modified) Gram-Schmidt

def better_gram_schmidt(X):

    # Modified Gram-Schmidt: subtract each projection from the running
    # residual u rather than from the original column; this keeps Q far
    # closer to orthogonal when columns are nearly collinear
    Q = []
    for a in X.T:
        u = a.astype(float)
        for q in Q:
            u = u - np.dot(u, q) * q
        # drop columns that are (numerically) dependent on earlier ones
        if np.linalg.norm(u) > 1e-10:
            Q.append(u / np.linalg.norm(u))
    return np.array(Q).T

# Orthogonalize B with modified Gram-Schmidt, then recover R = Q^T B_train so
# the test set can be mapped into the same basis (pinv tolerates any columns
# dropped for near-dependence)

Q_train_4 = better_gram_schmidt(B_train)

R_4 = Q_train_4.T @ B_train

Q_test_4 = B_test @ np.linalg.pinv(R_4)
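
# A quick sanity check (an added aside, not in the original): the columns of Q
# should be orthonormal, i.e. Q^T Q should equal I up to floating-point error
assert np.allclose(Q_train_4.T @ Q_train_4, np.eye(Q_train_4.shape[1]), atol=1e-8)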

# Add a bias column

Q_train_4 = np.hstack((np.ones((Q_train_4.shape[0], 1)), Q_train_4))

Q_test_4 = np.hstack((np.ones((Q_test_4.shape[0], 1)), Q_test_4))

# Compute the least-squares solution

A_inv = np.linalg.inv(Q_train_4.T @ Q_train_4) @ Q_train_4.T

weights_4 = A_inv @ y_train

# Compute predictions on the training and test sets

y_train_pred_4 = Q_train_4 @ weights_4

y_test_pred_4 = Q_test_4 @ weights_4

# Compute the mean squared error on the training and test sets

mse_train_4 = np.mean((y_train_pred_4 - y_train) ** 2)

mse_test_4 = np.mean((y_test_pred_4 - y_test) ** 2)
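
# Added for convenience (not in the original): print the four MSE pairs as a
# text summary alongside the chart
results = [(mse_train_1, mse_test_1), (mse_train_2, mse_test_2),
           (mse_train_3, mse_test_3), (mse_train_4, mse_test_4)]
for i, (tr, te) in enumerate(results, start=1):
    print(f'Method {i}: train MSE = {tr:.4f}, test MSE = {te:.4f}')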

# Plot the results as a bar chart

plt.bar(['Train1', 'Test1'], [mse_train_1, mse_test_1], label='Method 1')

plt.bar(['Train2', 'Test2'], [mse_train_2, mse_test_2], label='Method 2')

plt.bar(['Train3', 'Test3'], [mse_train_3, mse_test_3], label='Method 3')

plt.bar(['Train4', 'Test4'], [mse_train_4, mse_test_4], label='Method 4')

plt.xlabel('Dataset')

plt.ylabel('Mean Squared Error')

plt.title('Comparison of Methods')

plt.legend()

plt.show()

import numpy as np

import matplotlib.pyplot as plt

from sklearn.datasets import fetch_california_housing

from sklearn.model_selection import train_test_split

# Load the California housing dataset (load_boston was removed in
# scikit-learn 1.2; this regression dataset serves as a substitute)

housing = fetch_california_housing()

X, y = housing.data, housing.target


# Split the dataset into a training set and a test set

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Method 1 - ordinary least squares


# Add a bias (intercept) column
X_train_1 = np.hstack((np.ones((X_train.shape[0], 1)), X_train))

X_test_1 = np.hstack((np.ones((X_test.shape[0], 1)), X_test))



# Compute the least-squares solution via the normal equations
A_inv = np.linalg.inv(X_train_1.T @ X_train_1) @ X_train_1.T

weights_1 = A_inv @ y_train



# Compute predictions on the training and test sets
y_train_pred_1 = X_train_1 @ weights_1

y_test_pred_1 = X_test_1 @ weights_1

# Compute the mean squared error on the training and test sets

mse_train_1 = np.mean((y_train_pred_1 - y_train) ** 2)

mse_test_1 = np.mean((y_test_pred_1 - y_test) ** 2)

# Method 2 - Gram-Schmidt orthogonalization

def gram_schmidt(X):
    # Householder QR as a robust stand-in for Gram-Schmidt, as in the first
    # script; R is kept for the test-set transform
    Q, R = np.linalg.qr(X)

    return Q, R


# Orthogonalize the training set, keeping R for the test-set transform

X_train_2, R_2 = gram_schmidt(X_train)



X_train_2 = np.hstack((np.ones((X_train_2.shape[0], 1)), X_train_2))



# Compute the least-squares solution
A_inv = np.linalg.inv(X_train_2.T @ X_train_2) @ X_train_2.T

weights_2 = A_inv @ y_train



# Map the test set into the training basis (X_train = Q R, so X_test @ R^-1)
X_test_2 = X_test @ np.linalg.inv(R_2)

X_test_2 = np.hstack((np.ones((X_test_2.shape[0], 1)), X_test_2))



y_train_pred_2 = X_train_2 @ weights_2

y_test_pred_2 = X_test_2 @ weights_2

# Compute the mean squared error on the training and test sets

mse_train_2 = np.mean((y_train_pred_2 - y_train) ** 2)

mse_test_2 = np.mean((y_test_pred_2 - y_test) ** 2)

# Method 3 - feature transformation + Gram-Schmidt

# Transform the features with sin(), as in the first script
B_train = np.sin(X_train)

B_test = np.sin(X_test)
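
# A hedged diagnostic (an addition, not in the original): the condition number
# measures how close B's columns are to collinear; an orthonormal Q has a
# condition number of about 1
print('cond(B_train) =', np.linalg.cond(B_train))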



Q_train_3, R_3 = gram_schmidt(B_train)

# Map the test set through the training-set R instead of re-orthogonalizing it
Q_test_3 = B_test @ np.linalg.inv(R_3)



Q_train_3 = np.hstack((np.ones((Q_train_3.shape[0], 1)), Q_train_3))

Q_test_3 = np.hstack((np.ones((Q_test_3.shape[0], 1)), Q_test_3))



A_inv = np.linalg.inv(Q_train_3.T @ Q_train_3) @ Q_train_3.T

weights_3 = A_inv @ y_train



y_train_pred_3 = Q_train_3 @ weights_3

y_test_pred_3 = Q_test_3 @ weights_3

# Compute the mean squared error on the training and test sets

mse_train_3 = np.mean((y_train_pred_3 - y_train) ** 2)

mse_test_3 = np.mean((y_test_pred_3 - y_test) ** 2)

# Method 4 - "better" (modified) Gram-Schmidt

def better_gram_schmidt(X):
    # Modified Gram-Schmidt, as in the first script: subtract each projection
    # from the running residual u, which stays much closer to orthogonal when
    # the columns are nearly collinear
    Q = []
    for a in X.T:
        u = a.astype(float)
        for q in Q:
            u = u - np.dot(u, q) * q
        if np.linalg.norm(u) > 1e-10:
            Q.append(u / np.linalg.norm(u))

    return np.array(Q).T


Q_train_4 = better_gram_schmidt(B_train)

# Map the test set into the training basis via R = Q^T B_train
R_4 = Q_train_4.T @ B_train
Q_test_4 = B_test @ np.linalg.pinv(R_4)



Q_train_4 = np.hstack((np.ones((Q_train_4.shape[0], 1)), Q_train_4))

Q_test_4 = np.hstack((np.ones((Q_test_4.shape[0], 1)), Q_test_4))



A_inv = np.linalg.inv(Q_train_4.T @ Q_train_4) @ Q_train_4.T

weights_4 = A_inv @ y_train



y_train_pred_4 = Q_train_4 @ weights_4

y_test_pred_4 = Q_test_4 @ weights_4

# Compute the mean squared error on the training and test sets

mse_train_4 = np.mean((y_train_pred_4 - y_train) ** 2)

mse_test_4 = np.mean((y_test_pred_4 - y_test) ** 2)

# Visualization

plt.bar(['Train1', 'Test1'], [mse_train_1, mse_test_1], label='Model 1')

plt.bar(['Train2', 'Test2'], [mse_train_2, mse_test_2], label='Model 2')

plt.bar(['Train3', 'Test3'], [mse_train_3, mse_test_3], label='Model 3')

plt.bar(['Train4', 'Test4'], [mse_train_4, mse_test_4], label='Model 4')

plt.xlabel('Dataset')

plt.ylabel('Mean Squared Error')

plt.title('Comparison of Models')

plt.legend()

plt.show()