300字范文 > python-多元线性回归模型

python-多元线性回归模型

时间：2022-03-15 12:41:10

相关推荐

python-多元线性回归模型

只是python代码方便以后调用，理论部分需要看其他人的

回归模型的预测

import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
import statsmodels.api as sm
from sklearn import model_selection
from scipy.stats import f
from scipy.stats import norm

# Global matplotlib font; the plots further down use Chinese labels.
font = {'family': 'FangSong', 'weight': 'bold', 'size': 12}
matplotlib.rc("font", **font)

# Load the data set and rename its five columns to the short aliases a..e.
# Column listing pasted in the original notes (49 non-null rows each):
#   a = RD_Spend         float64
#   b = Administration   float64
#   c = Marketing_Spend  float64
#   d = State            object
#   e = Profit           float64
Profit = pd.read_excel("../data/Predict to Profit.xlsx", names=list("abcde"))
print(Profit.shape)

# Hold out 20% of the rows as a test set; the seed is fixed.
train, test = model_selection.train_test_split(Profit, test_size=0.2, random_state=1234)

# Fit an OLS model on the training rows; C(d) marks d as categorical.
model = sm.formula.ols('e ~ a+b+c+C(d)', data=train).fit()

# Predict on the test rows after removing the response column e.
test_X = test.drop(columns='e')
pred = model.predict(exog=test_X)

# Show predicted and actual values side by side.
comparison = pd.DataFrame({'pred': pred, 'real': test.e})
print(comparison)

模型的显著性检验 F检验

# Overall significance of the regression: F statistic computed by hand.

# Mean of the response over the training rows.
ybar = train.e.mean()

p = model.df_model  # model degrees of freedom (number of explanatory terms)
n = len(train)      # number of training observations

fitted = model.fittedvalues

# Sum of squared deviations of the fitted values from the response mean.
RSS = ((fitted - ybar) ** 2).sum()
# Sum of squared differences between observed and fitted values.
ESS = ((train.e - fitted) ** 2).sum()

# F statistic from the two sums of squares and their degrees of freedom.
F = (RSS / p) / (ESS / (n - p - 1))
# The same statistic as reported by the fitted model (kept for comparison).
F1 = model.fvalue
print(F)

# 95% quantile of the F(p, n-p-1) distribution, to compare against F.
F_Theroy = f.ppf(q=0.95, dfn=p, dfd=n - p - 1)
print(F_Theroy)

回归系数的显著性检验 t检验

# Print the regression summary of the fitted model; the prose below reads
# the P>|t| column of this table.
summary_table = model.summary()
print(summary_table)

P>|t| 的值小于 0.05 时，对应的回归系数才通过 t 检验（在 5% 的显著性水平下显著），该自变量才值得保留在模型中

回归模型的诊断

①误差项ε服从正态分布

误差项服从正态分布，就是要求因变量服从正态分布

绘制直方图

# Histogram of the response with a kernel density estimate and a fitted
# normal density overlaid, to judge normality by eye.
# sns.distplot is deprecated in seaborn, so the same three layers are drawn
# with histplot / kdeplot and an explicit normal pdf instead.
profit_values = Profit.e.dropna()

# Density-normalised histogram (equivalent of norm_hist=True).
ax = sns.histplot(x=profit_values, bins=10, stat='density', color='steelblue')

# Kernel density estimate.
sns.kdeplot(x=profit_values, ax=ax, color='black', linestyle='--', label='核密度图')

# Normal density with parameters estimated from the data (equivalent of fit=norm).
mu, sigma = norm.fit(profit_values)
grid = np.linspace(profit_values.min(), profit_values.max(), 200)
ax.plot(grid, norm.pdf(grid, mu, sigma), color='red', linestyle=':', label='正态密度曲线')

plt.legend()
# Display the figure.
plt.show()

②无多重共线性

关于多重共线性的检验可以使用方差膨胀因子 VIF 来鉴定：如果 VIF 大于 10，则说明变量间存在多重共线性；如果 VIF 大于 100，则表明变量间存在严重的多重共线性。如果发现变量之间存在多重共线性，则可以考虑删除变量或重新选择模型。

# Multicollinearity check using the variance inflation factor (VIF).
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Design matrix: columns a (RD_Spend) and c (Marketing_Spend) plus a constant column.
# .loc replaces the original .ix indexer, which was removed in pandas 1.0.
X = sm.add_constant(Profit.loc[:, ['a', 'c']])

# One row per column of X, holding that column's VIF.
vif = pd.DataFrame()
vif['features'] = X.columns
vif['VIF Factor'] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
print(vif)

③线性相关性

高度相关：|ρ| >= 0.8；中度相关：0.5 <= |ρ| < 0.8；弱相关：0.3 <= |ρ| < 0.5；几乎不相关：|ρ| < 0.3。自变量与因变量之间的相关性越大越好。

# Correlation between each numeric explanatory column and the response e.
# Column d holds strings (State), so only numeric columns are passed to
# corrwith; recent pandas versions raise on non-numeric columns instead of
# silently skipping them.
predictors = Profit.drop(labels=['e'], axis=1).select_dtypes(include='number')
res = predictors.corrwith(Profit.e)
print(res)

# Scatter-plot matrix of the numeric columns.
sns.pairplot(Profit.loc[:, ['a', 'b', 'c', 'e']])
plt.show()

综合考虑相关系数、散点图矩阵和 t 检验的结果，最终确定只保留模型 model 中的 a（RD_Spend）和 c（Marketing_Spend）两个自变量，重新对该模型做修正。

# Reduced model: regress e on a and c only, using the training rows.
reduced_spec = sm.formula.ols('e ~ a + c', data=train)
model2 = reduced_spec.fit()

# Estimated coefficients; the original notes report the fitted equation as
# e = 51902.112471 + 0.79*a + 0.02*c.
print(model2.params)

异常值检验

通常利用帽子矩阵、DFFITS准则、学生化残差或Cook距离进行异常点检测

# Outlier diagnostics for model2, then a refit without the flagged rows.
outliers = model2.get_influence()

# np.asarray strips any pandas index from the statistics. Depending on the
# statsmodels version they may carry the shuffled training row labels, and
# concatenating such Series with positionally indexed ones would misalign
# rows. Plain arrays are always combined by position.
# Leverage (diagonal of the hat matrix).
leverage = np.asarray(outliers.hat_matrix_diag)
# DFFITS values (first element of the returned pair).
dffits = np.asarray(outliers.dffits[0])
# Externally studentized residuals.
resid_stu = np.asarray(outliers.resid_studentized_external)
# Cook's distance (first element of the returned pair).
cook = np.asarray(outliers.cooks_distance[0])

# Gather the four statistics into one frame with a 0..n-1 index.
contat1 = pd.concat(
    [
        pd.Series(leverage, name='leverage'),
        pd.Series(dffits, name='dffits'),
        pd.Series(resid_stu, name='resid_stu'),
        pd.Series(cook, name='cook'),
    ],
    axis=1,
)

# Give train the same 0..n-1 index so the two frames line up row by row.
train.index = range(train.shape[0])
profit_outliers = pd.concat([train, contat1], axis=1)
print(profit_outliers.head())

# For simplicity a row is treated as an outlier when the absolute value of
# its studentized residual exceeds 2.
is_outlier = np.abs(profit_outliers.resid_stu) > 2
outliers_ratio = is_outlier.mean()
print(outliers_ratio)

# The original notes found this share to be below 5% and chose to drop the rows.
none_outliers = profit_outliers.loc[~is_outlier, :]

# Refit the reduced model on the rows that were not flagged.
model3 = sm.formula.ols('e ~ a + c', data=none_outliers).fit()
print(model3.params)

方差齐性检验

方差齐性是要求模型残差项的方差不随自变量的变动而呈现某种趋势，否则，残差的趋势就可以被自变量刻画。

# Residual plots for the homoscedasticity check, followed by a
# predicted-vs-actual plot on the test set.

# Standardized residuals of model3 (centred, divided by their standard deviation).
std_resid = (model3.resid - model3.resid.mean()) / model3.resid.std()

# One panel per explanatory variable, stacked vertically.
panels = [
    (0, none_outliers.a, 'RD_Spend'),
    (1, none_outliers.c, 'Marketing_Spend'),
]
axes = []
for row, x_values, x_label in panels:
    panel = plt.subplot2grid(shape=(2, 1), loc=(row, 0))
    panel.scatter(x_values, std_resid)
    # Horizontal reference line at zero across the observed x range.
    panel.hlines(y=0, xmin=x_values.min(), xmax=x_values.max(), colors='red', linestyles='--')
    panel.set_xlabel(x_label)
    panel.set_ylabel('Std_Residual')
    axes.append(panel)
ax1, ax2 = axes

# Spacing between the sub-plots.
plt.subplots_adjust(hspace=0.6, wspace=0.3)
plt.show()

# Predict on the test rows with the refitted model.
pred3 = model3.predict(exog=test.loc[:, ['a', 'c']])

# Predicted vs. actual, with a dashed y = x reference line.
plt.scatter(x=test.e, y=pred3)
lowest, highest = test.e.min(), test.e.max()
plt.plot([lowest, highest], [lowest, highest], color='red', linestyle='--')
plt.xlabel('实际值')
plt.ylabel('预测值')
plt.show()

本内容不代表本网观点和政治立场，如有侵犯你的权益请联系我们处理。

网友评论

网友评论仅供其表达个人看法，并不表明网站立场。