Machine Learning - Regression Problems


Linear Regression

import sklearn.linear_model as lm
Create the model object: model = lm.LinearRegression()
Train the model object: model.fit(x, y)  # [x, y] -BGD-> [w0, w1]
Predict outputs for given inputs: pred_y = model.predict(pred_x)
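The `[x, y] -BGD-> [w0, w1]` note refers to batch gradient descent: repeatedly nudging the parameters against the gradient of the mean squared error loss. As a sketch of the idea, with learning rate $\eta$:

$loss = \frac{1}{2m}\sum_{i=1}^{m}(w_0 + w_1 x_i - y_i)^2$
$w_0 \leftarrow w_0 - \eta \frac{\partial\,loss}{\partial w_0}, \quad w_1 \leftarrow w_1 - \eta \frac{\partial\,loss}{\partial w_1}$

(sklearn's `LinearRegression` actually solves this least-squares problem in closed form rather than iterating, but the loss being minimized is the same.)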

import pickle
import numpy as np
import sklearn.linear_model as lm
import sklearn.metrics as sm
import matplotlib.pyplot as mp

x, y = [], []
with open('../data/single.txt', 'r') as f:
    for line in f.readlines():
        data = [float(substr) for substr in line.split(',')]
        x.append(data[:-1])
        y.append(data[-1])
x = np.array(x)
y = np.array(y)
# Create the linear regressor
model = lm.LinearRegression()
# Train the linear regressor: find the model parameters that minimize the loss
model.fit(x, y)
# Test the linear regressor
pred_y = model.predict(x)
for train, pred in zip(y, pred_y):
    print(train, '->', pred)
# Mean absolute error: mean(|y - y'|)
print(sm.mean_absolute_error(y, pred_y))
# Mean squared error: mean((y - y')^2)
print(sm.mean_squared_error(y, pred_y))
# Median absolute error: median(|y - y'|)
print(sm.median_absolute_error(y, pred_y))
# Explained variance score: best value 1
print(sm.explained_variance_score(y, pred_y))
# R2 score: an overall evaluation combining the above; best value 1
print(sm.r2_score(y, pred_y))
# Save the trained model
with open('../data/linear.pkl', 'wb') as f:
    pickle.dump(model, f)
mp.figure('Linear Regression', facecolor='lightgray')
mp.title('Linear Regression', fontsize=20)
mp.xlabel('x', fontsize=14)
mp.ylabel('y', fontsize=14)
mp.tick_params(labelsize=10)
mp.grid(linestyle=':')
mp.scatter(x, y, c='dodgerblue', alpha=0.75, s=60, label='Sample')
sorted_indices = x.T[0].argsort()
mp.plot(x[sorted_indices], pred_y[sorted_indices],
        c='orangered', label='Regression')
mp.legend()
mp.show()
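For reference, the R2 score reported above is just one minus the ratio of the residual sum of squares to the total sum of squares; a minimal sketch, reusing `y` and `pred_y` from the code above:

sse = ((y - pred_y) ** 2).sum()    # residual sum of squares
sst = ((y - y.mean()) ** 2).sum()  # total sum of squares
print(1 - sse / sst)               # agrees with sm.r2_score(y, pred_y)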
import pickle
import numpy as np
import sklearn.linear_model as lm
import sklearn.metrics as sm
import matplotlib.pyplot as mp

x, y = [], []
with open('../data/single.txt', 'r') as f:
    for line in f.readlines():
        data = [float(substr) for substr in line.split(',')]
        x.append(data[:-1])
        y.append(data[-1])
x = np.array(x)
y = np.array(y)
# Load the trained model from file
with open('../data/linear.pkl', 'rb') as f:
    model = pickle.load(f)
# Test the linear regressor
pred_y = model.predict(x)
for train, pred in zip(y, pred_y):
    print(train, '->', pred)
# Mean absolute error: mean(|y - y'|)
print(sm.mean_absolute_error(y, pred_y))
# Mean squared error: mean((y - y')^2)
print(sm.mean_squared_error(y, pred_y))
# Median absolute error: median(|y - y'|)
print(sm.median_absolute_error(y, pred_y))
# Explained variance score: best value 1
print(sm.explained_variance_score(y, pred_y))
# R2 score: an overall evaluation combining the above; best value 1
print(sm.r2_score(y, pred_y))
mp.figure('Linear Regression', facecolor='lightgray')
mp.title('Linear Regression', fontsize=20)
mp.xlabel('x', fontsize=14)
mp.ylabel('y', fontsize=14)
mp.tick_params(labelsize=10)
mp.grid(linestyle=':')
mp.scatter(x, y, c='dodgerblue', alpha=0.75, s=60, label='Sample')
sorted_indices = x.T[0].argsort()
mp.plot(x[sorted_indices], pred_y[sorted_indices],
        c='orangered', label='Regression')
mp.legend()
mp.show()

Ridge Regression

Ordinary linear regression treats all training samples equally when computing the total sample error (the loss), so a small number of "bad" samples can pull the model away from the pattern followed by the majority of good samples and hurt prediction accuracy. Ridge regression builds on linear regression by adding a penalty on the size of the model's coefficients to the loss. The penalty keeps the coefficients from growing large to chase the few abnormal samples, so the final model stays as close as possible to the majority of good samples while the influence of the few bad ones is weakened.
The regularization strength (penalty strength) is therefore a hyperparameter, set manually:
model = lm.Ridge(regularization strength)
Regularization strength / penalty strength: [0, oo)
The smaller the regularization strength, the weaker the penalty; 0 means no penalty at all, which is equivalent to plain linear regression.
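Concretely, ridge regression minimizes the squared-error loss plus an L2 penalty on the coefficients, with the regularization strength $\alpha$ being the value passed to `lm.Ridge`:

$loss = \sum_{i}(y_i - \hat{y}_i)^2 + \alpha \sum_{j} w_j^2$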

import numpy as np
import sklearn.linear_model as lm
import sklearn.metrics as sm
import matplotlib.pyplot as mp

x, y = [], []
with open('../data/abnormal.txt', 'r') as f:
    for line in f.readlines():
        data = [float(substr) for substr in line.split(',')]
        x.append(data[:-1])
        y.append(data[-1])
x = np.array(x)
y = np.array(y)
# Create the linear regressor
model1 = lm.LinearRegression()
# Train the linear regressor
model1.fit(x, y)
# Test the linear regressor
pred_y1 = model1.predict(x)
# R2 score of plain linear regression
print(sm.r2_score(y, pred_y1))
# Create the ridge regressor (regularization strength 250)
model2 = lm.Ridge(250)
# Train the ridge regressor: the penalty weakens the influence of abnormal samples
model2.fit(x, y)
# Test the ridge regressor
pred_y2 = model2.predict(x)
# R2 score of ridge regression
print(sm.r2_score(y, pred_y2))
mp.figure('Linear & Ridge Regression', facecolor='lightgray')
mp.title('Linear & Ridge Regression', fontsize=20)
mp.xlabel('x', fontsize=14)
mp.ylabel('y', fontsize=14)
mp.tick_params(labelsize=10)
mp.grid(linestyle=':')
mp.scatter(x, y, c='dodgerblue', alpha=0.75, s=60, label='Sample')
sorted_indices = x.T[0].argsort()
mp.plot(x[sorted_indices], pred_y1[sorted_indices],
        c='orangered', label='Linear')
mp.plot(x[sorted_indices], pred_y2[sorted_indices],
        c='limegreen', label='Ridge')
mp.legend()
mp.show()
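The strength of 250 above is hand-picked. A common alternative is to let cross-validation choose it; a minimal sketch using sklearn's `lm.RidgeCV` (the candidate grid here is an arbitrary assumption):

import sklearn.linear_model as lm
# evaluate each candidate strength by cross-validation and keep the best
model = lm.RidgeCV(alphas=[0.1, 1, 10, 100, 250], cv=5)
model.fit(x, y)
print(model.alpha_)  # the selected regularization strength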

Polynomial Regression

$y = w_0 + w_1 x + w_2 x^2 + w_3 x^3 + \dots + w_n x^n$
$loss = Loss(w_0, w_1, \dots, w_n)$
Treating each power of $x$ as its own feature, $x_k = x^k$, turns this into an ordinary linear model:
$y = w_0 + w_1 x_1 + w_2 x_2 + w_3 x_3 + \dots + w_n x_n$
$\Downarrow$
x -> polynomial feature expansion -> x1, x2, x3, …, xn -> linear regression -> w0~wn
$\Downarrow$
Pipeline

import sklearn.pipeline as pl
import sklearn.preprocessing as sp
Polynomial feature expander: sp.PolynomialFeatures(degree)  # degree = highest power
Linear regressor: lm.LinearRegression()
Pipeline model: pl.make_pipeline(polynomial feature expander, linear regressor)
Pipeline model.fit(x, y)  # [x, y] -BGD-> [w0, w1, w2, w3, …, wn]
Pipeline model.predict(x) -> pred_y
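To see what the expansion step actually produces, a quick check (degree 3, one input column; the leading 1 is the bias term):

import numpy as np
import sklearn.preprocessing as sp
# a single sample x = 2 expands into [1, x, x^2, x^3]
print(sp.PolynomialFeatures(3).fit_transform(np.array([[2.0]])))
# [[1. 2. 4. 8.]]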
**Underfitting**: the model is too simple, or the training set too small, so the model cannot capture the true relationship between inputs and outputs; both the training-set and test-set scores come out low. Increasing the model's complexity, or enlarging the training set, improves the fit and the model's performance.
**Overfitting**: the model is too complex, or there are too many features, so the model loses generality: it leans too heavily on the training data, and its prediction performance on samples outside the training set drops sharply. Reducing the number of features, or lowering the model's complexity, seeks a compromise between training-set and test-set fit and improves generalization. The sketch below illustrates both failure modes.
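A minimal sketch comparing training and test R2 scores across polynomial degrees (the synthetic data here is an assumption for illustration): too low a degree typically scores poorly on both splits, while too high a degree scores well on the training split but worse on the test split.

import numpy as np
import sklearn.pipeline as pl
import sklearn.preprocessing as sp
import sklearn.linear_model as lm
import sklearn.metrics as sm
import sklearn.model_selection as ms
# synthetic cubic data with noise (illustration only)
np.random.seed(7)
x = np.linspace(-1, 1, 60)[:, np.newaxis]
y = (x ** 3 - x).ravel() + np.random.normal(0, 0.05, 60)
train_x, test_x, train_y, test_y = ms.train_test_split(
    x, y, test_size=0.25, random_state=7)
for degree in (1, 3, 15):
    model = pl.make_pipeline(sp.PolynomialFeatures(degree),
                             lm.LinearRegression())
    model.fit(train_x, train_y)
    print(degree,
          sm.r2_score(train_y, model.predict(train_x)),
          sm.r2_score(test_y, model.predict(test_x)))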

import numpy as np
import sklearn.pipeline as pl
import sklearn.preprocessing as sp
import sklearn.linear_model as lm
import sklearn.metrics as sm
import matplotlib.pyplot as mp

train_x, train_y = [], []
with open('../data/single.txt', 'r') as f:
    for line in f.readlines():
        data = [float(substr) for substr in line.split(',')]
        train_x.append(data[:-1])
        train_y.append(data[-1])
train_x = np.array(train_x)
train_y = np.array(train_y)
model = pl.make_pipeline(sp.PolynomialFeatures(10),
                         lm.LinearRegression())
model.fit(train_x, train_y)
pred_train_y = model.predict(train_x)
print(sm.r2_score(train_y, pred_train_y))
test_x = np.linspace(train_x.min(), train_x.max(), 1000)[:, np.newaxis]
pred_test_y = model.predict(test_x)
mp.figure('Polynomial Regression', facecolor='lightgray')
mp.title('Polynomial Regression', fontsize=20)
mp.xlabel('x', fontsize=14)
mp.ylabel('y', fontsize=14)
mp.tick_params(labelsize=10)
mp.grid(linestyle=':')
mp.scatter(train_x, train_y, c='dodgerblue',
           alpha=0.75, s=60, label='Sample')
mp.plot(test_x, pred_test_y, c='orangered', label='Regression')
mp.legend()
mp.show()

Decision Trees

Decision trees can be used to solve both regression problems and classification problems.

  1. Similar inputs produce similar outputs

    Age: 0 = young, 1 = middle-aged, 2 = senior
    Education: 0 = associate degree, 1 = bachelor, 2 = master, 3 = doctorate
    Experience: 0 = beginner, 1 = junior, 2 = expert, 3 = veteran
    Gender: 0 = female, 1 = male
    Level: 0 = low income, 1 = middle income, 2 = high income

    Age  Education  Experience  Gender  Salary  Level
    0    1          0           1       6000    0
    0    0          1           1       7000    1
    1    2          2           1       10000   2
    -->
    0    0          1           1       average the outputs (regression) / vote on the outputs (classification)
  2. Build a tree model to speed up retrieval of similar inputs
    Take each feature in the sample space in turn as the basis for splitting the table, dividing the sample matrix into several levels of sub-matrices; each level corresponds to one feature, and together they form a tree. At prediction time, follow each feature value of the query sample down to the matching leaf sub-table, then compute the prediction from that sub-table's outputs by averaging or voting, as in the sketch below.
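    A minimal sketch of this retrieval idea on the toy salary table above (illustration only; with three samples the tree is trivially small):

    import numpy as np
    import sklearn.tree as st
    # features: age, education, experience, gender; target: monthly salary
    x = np.array([[0, 1, 0, 1],
                  [0, 0, 1, 1],
                  [1, 2, 2, 1]])
    y = np.array([6000, 7000, 10000])
    model = st.DecisionTreeRegressor(max_depth=4)
    model.fit(x, y)
    # the query descends to the leaf holding the most similar training samples
    print(model.predict([[0, 0, 1, 1]]))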

  3. Prefer the features that influence the output most when splitting sub-tables
    A feature's influence on the output is judged by how much splitting on it reduces the information entropy or Gini impurity: the larger the reduction, the greater the feature's influence on the output, and the earlier it should be used as the basis for splitting. A toy computation follows.
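    To make the split criterion concrete, here is a toy computation of Gini impurity and the reduction a candidate split achieves (a sketch of the idea, not sklearn's internal code):

    import numpy as np

    def gini(labels):
        # Gini impurity: 1 - sum of squared class probabilities
        _, counts = np.unique(labels, return_counts=True)
        p = counts / counts.sum()
        return 1.0 - np.sum(p ** 2)

    labels = np.array([0, 0, 1, 1, 2, 2])  # outputs at the parent node
    left, right = labels[:3], labels[3:]   # sub-tables after a candidate split
    reduction = gini(labels) - (len(left) * gini(left) +
                                len(right) * gini(right)) / len(labels)
    print(reduction)  # the larger the reduction, the better the split feature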

  4. Ensemble algorithms

    1. Bootstrap aggregating (bagging): build each decision tree from a random sample drawn from the full sample space, for B trees in total.
    2. Random forest: build each decision tree from a random sample of both the rows and the features, for B trees in total.
    3. Boosting: assign every sample an initial weight and build the first decision tree; raise the weights of the training samples it mispredicts, build the second tree, and so on, producing B trees trained under different sample weights.
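    The three strategies map onto sklearn constructors roughly as follows (a sketch; only the decision tree and AdaBoost variants are exercised in the example below):

    import sklearn.tree as st
    import sklearn.ensemble as se
    # bagging: each tree sees a bootstrap sample of the rows
    bagging = se.BaggingRegressor(st.DecisionTreeRegressor(max_depth=4),
                                  n_estimators=400, random_state=7)
    # random forest: random rows plus a random subset of features per split
    forest = se.RandomForestRegressor(max_depth=10, n_estimators=1000)
    # boosting: mispredicted samples gain weight between successive trees
    boosting = se.AdaBoostRegressor(st.DecisionTreeRegressor(max_depth=4),
                                    n_estimators=400, random_state=7)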
    import numpy as np
    import sklearn.datasets as sd
    import sklearn.utils as su
    import sklearn.tree as st
    import sklearn.ensemble as se
    import sklearn.metrics as sm
    boston = sd.load_boston()
    x, y = su.shuffle(boston.data, boston.target, random_state=7)
    train_size = int(len(x) * 0.8)
    train_x, test_x, train_y, test_y = \
        x[:train_size], x[train_size:], \
        y[:train_size], y[train_size:]
    # Decision tree regressor
    model = st.DecisionTreeRegressor(max_depth=4)
    model.fit(train_x, train_y)
    pred_test_y = model.predict(test_x)
    print(sm.r2_score(test_y, pred_test_y))
    # Boosted (AdaBoost) ensemble of decision tree regressors
    model = se.AdaBoostRegressor(
        st.DecisionTreeRegressor(max_depth=4),
        n_estimators=400, random_state=7)
    model.fit(train_x, train_y)
    pred_test_y = model.predict(test_x)
    print(sm.r2_score(test_y, pred_test_y))
  5. Feature importance
    While choosing how to split sub-tables, a decision tree model computes the reduction in information entropy or Gini impurity that splitting on each feature would bring; from a business standpoint, this quantity measures each feature's influence on the output.
    model = …
    model.fit(…)
    model.feature_importances_

    import numpy as np
    import sklearn.datasets as sd
    import sklearn.utils as su
    import sklearn.tree as st
    import sklearn.ensemble as se
    import matplotlib.pyplot as mp
    boston = sd.load_boston()
    fn = boston.feature_names
    x, y = su.shuffle(boston.data, boston.target, random_state=7)
    train_size = int(len(x) * 0.8)
    train_x, test_x, train_y, test_y = \
        x[:train_size], x[train_size:], \
        y[:train_size], y[train_size:]
    # Boosted (AdaBoost) ensemble of decision tree regressors
    model = se.AdaBoostRegressor(
        st.DecisionTreeRegressor(max_depth=4),
        n_estimators=400, random_state=7)
    model.fit(train_x, train_y)
    fi = model.feature_importances_
    for n, i in zip(fn, fi):
        print('{:>10} : {:.4f}'.format(n, i))
    mp.figure('Feature Importance', facecolor='lightgray')
    mp.title('Feature Importance', fontsize=20)
    mp.xlabel('Feature', fontsize=14)
    mp.ylabel('Importance', fontsize=14)
    mp.tick_params(labelsize=10)
    mp.grid(axis='y', linestyle=':')
    sorted_indices = fi.argsort()[::-1]
    pos = np.arange(sorted_indices.size)
    mp.bar(pos, fi[sorted_indices], facecolor='lightcoral',
           edgecolor='indianred')
    mp.xticks(pos, fn[sorted_indices], rotation=30)
    mp.show()

    Feature importance depends not only on the model's algorithm but also on the granularity of the data.

    import csv
    import numpy as np
    import sklearn.utils as su
    import sklearn.ensemble as se
    import sklearn.metrics as sm
    import matplotlib.pyplot as mp
    with open('../data/bike_day.csv', 'r') as f:
        reader = csv.reader(f)
        x, y = [], []
        for row in reader:
            x.append(row[2:13])
            y.append(row[-1])
    fn_dy = np.array(x[0])
    x = np.array(x[1:], dtype=float)
    y = np.array(y[1:], dtype=float)
    x, y = su.shuffle(x, y, random_state=7)
    train_size = int(len(x) * 0.9)
    train_x, test_x, train_y, test_y = \
        x[:train_size], x[train_size:], \
        y[:train_size], y[train_size:]
    # Random forest ensemble of decision tree regressors
    model = se.RandomForestRegressor(
        max_depth=10, n_estimators=1000,
        min_samples_split=2)
    model.fit(train_x, train_y)
    fi_dy = model.feature_importances_
    with open('../data/bike_hour.csv', 'r') as f:
        reader = csv.reader(f)
        x, y = [], []
        for row in reader:
            x.append(row[2:14])
            y.append(row[-1])
    fn_hr = np.array(x[0])
    x = np.array(x[1:], dtype=float)
    y = np.array(y[1:], dtype=float)
    x, y = su.shuffle(x, y, random_state=7)
    train_size = int(len(x) * 0.9)
    train_x, test_x, train_y, test_y = \
        x[:train_size], x[train_size:], \
        y[:train_size], y[train_size:]
    # Random forest ensemble of decision tree regressors
    model = se.RandomForestRegressor(
        max_depth=10, n_estimators=1000,
        min_samples_split=2)
    model.fit(train_x, train_y)
    fi_hr = model.feature_importances_
    mp.figure('Bike', facecolor='lightgray')
    mp.subplot(211)
    mp.title('Day', fontsize=16)
    mp.ylabel('Importance', fontsize=12)
    mp.tick_params(labelsize=10)
    mp.grid(axis='y', linestyle=':')
    sorted_indices = fi_dy.argsort()[::-1]
    pos = np.arange(sorted_indices.size)
    mp.bar(pos, fi_dy[sorted_indices], facecolor='deepskyblue',
           edgecolor='steelblue')
    mp.xticks(pos, fn_dy[sorted_indices], rotation=30)
    mp.subplot(212)
    mp.title('Hour', fontsize=16)
    mp.xlabel('Feature', fontsize=12)
    mp.ylabel('Importance', fontsize=12)
    mp.tick_params(labelsize=10)
    mp.grid(axis='y', linestyle=':')
    sorted_indices = fi_hr.argsort()[::-1]
    pos = np.arange(sorted_indices.size)
    mp.bar(pos, fi_hr[sorted_indices], facecolor='lightcoral',
           edgecolor='indianred')
    mp.xticks(pos, fn_hr[sorted_indices], rotation=30)
    mp.tight_layout()
    mp.show()

For regression problems, the model is concerned with the regression curve, which reflects the functional relationship between inputs and outputs.
For classification problems, the model is concerned with the decision boundary, which reflects how the different classes are separated.