机器学习 1 linear regression 作业

话说学机器学习，不写代码就太扯淡了。好了，接着上一次的线性回归作业。

hw1作业的链接在这： http://speech.ee.ntu.edu.tw/~tlkagk/courses/ML_2016/Lecture/hw1.pdf

作业是预测台湾的PM2.5指数。既然是回归问题，用的自然就是上一节课讲的线性回归了。

以上数据我传到https://pan.baidu.com/s/1dFhwT13 上面了，供有兴趣的人做做。

实际上，上述数据分为训练数据和测试数据，都是CSV格式的；其中只有PM2.5这一项有用，其他项都没有用到。同时，通过测试数据才知道，

这个作业其实就是用前9个小时的PM2.5数据作为特征，来预测第10个小时的数据，再将预测出的第10个小时的数据保存为csv格式，作为预测结果。

好了，不多说，上代码。我的开发环境还是win7+pycharm4.0

第一步，读取train.csv. 获取PM2.5的训练数据，一共240个训练数据，将前9个小时的数据作为特征，将第10个小时的数据作为标签

 # -*- coding:UTF-8 -*-
 # Step 1: read train.csv and keep only the PM2.5 rows. For each such row the
 # fields at indices 3..11 become the 9 input features and the field at
 # index 12 becomes the label.
 __author__ = 'tao'

 import csv
 import cv2
 import sys
 import numpy as np
 import math

 filename = 'F:/台湾机器学习/data/train.csv'
 # The path contains Chinese characters; under Python 2 it is decoded to a
 # unicode object before being handed to open().
 ufilename = unicode(filename, "utf8")

 # NOTE: `list` shadows the builtin, but the later steps of this script read
 # the training data under the names `list` and `result`, so they are kept.
 list = []    # one list of 9 ints per PM2.5 row (features)
 result = []  # one single-element list per PM2.5 row (label)
 row = 0      # number of lines read from the file
 colum = 0    # number of comma-separated fields in the last line read
 with open(ufilename, 'r') as f:
     for line in f:
         odom = line.split(',')
         colum = len(odom)
         if 'PM2.5' in odom:
             # List comprehensions instead of map() so the stored samples
             # are real lists that np.array() can consume later.
             list.append([int(value) for value in odom[3:12]])
             result.append([int(value) for value in odom[12:13]])
         row = row + 1
 print("有{0}个训练数据".format(len(list)))

第二步，利用梯度下降来训练权值和偏置。

# Step 2: fit y = w0*x0 + w1*x1 + ... + w8*x8 + b0 with full-batch gradient
# descent on the mean squared error. Reads the training samples from `list`
# (features) and `result` (labels).
alpha = 0.0001  # learning rate (was commented out, leaving `alpha` undefined)
# NOTE(review): the features are used unnormalized -- confirm that this alpha
# converges on the real data.

features = np.array(list, dtype=float)          # one row per training sample
labels = np.array(result, dtype=float).ravel()  # one label per training sample
length = len(features)
if length == 0:
    raise ValueError("no training samples loaded")

bias = np.random.rand()
weights = np.random.rand(9)
jtheta = 0.0  # cost of the previous epoch
for k in range(1000):
    # Residuals are computed once per epoch from the current parameters so
    # that the bias and all weights are updated simultaneously.
    errors = features.dot(weights) + bias - labels
    bias = bias - alpha / length * errors.sum()
    weights = weights - alpha / length * features.T.dot(errors)

    # Cost J = 1/(2m) * sum(error^2); `comp` is its change since the last epoch.
    jtheta_1 = 0.5 / length * np.sum(errors ** 2)
    comp = math.fabs(jtheta_1 - jtheta)
    # Columns: cost change, cost, bias, w0..w8. The undefined `dgree` scale
    # factor from the earlier version of this line has been dropped.
    print("%10.5f   %10.5f  %10.5f  %10.5f %10.5f   %10.5f  %10.5f  %10.5f %10.5f   %10.5f  %10.5f  %10.5f \n" % ((comp, jtheta_1, bias) + tuple(weights)))
    jtheta = jtheta_1

# The later steps read the parameters as b_0 and th_0..th_8 and were written
# against the (1, 1) arrays returned by np.random.rand(1, 1); keep that shape.
b_0 = np.array([[bias]])
th_0, th_1, th_2, th_3, th_4, th_5, th_6, th_7, th_8 = [
    np.array([[weight]]) for weight in weights
]

print("-训练得到的权值如下--")
print(" %10.5f %10.5f  %10.5f %10.5f   %10.5f  %10.5f  %10.5f %10.5f   %10.5f  %10.5f \n" % ((bias,) + tuple(weights)))

第三步，测试训练集。这一步可以省略，只是我在调试过程中用来查看模型对训练集的预测精度怎么样。

 # Sanity check on the training set: for every training sample print the
 # residual, i.e. the model's prediction minus the true label.
 trained_weights = (th_0, th_1, th_2, th_3, th_4, th_5, th_6, th_7, th_8)
 for sample_index, sample in enumerate(list):
     xset = np.array(sample)
     # Accumulate b_0 + th_0*x0 + ... + th_8*x8 term by term, left to right.
     prediction = b_0
     for feature_index, weight in enumerate(trained_weights):
         prediction = prediction + weight * xset[feature_index]
     error = prediction - np.array(result[sample_index])
     print("训练集的实际误差{0}".format(error))

第四步，运行测试集，并保存测试结果。

首先读取测试集的数据，和训练集一样

 # Step 4a: read test_X.csv and keep only the PM2.5 rows. For each such row
 # the fields at indices 2..10 are the 9 input features.
 testfilename = 'F:/台湾机器学习/data/test_X.csv'
 # The path contains Chinese characters; under Python 2 it is decoded to a
 # unicode object before being handed to open().
 utestfilename = unicode(testfilename, "utf8")
 testlist = []  # one list of 9 ints per PM2.5 row
 testrow = 0    # number of lines read from the test file
 testcolum = 0  # number of comma-separated fields in the last line read
 with open(utestfilename, 'r') as f:
     for line in f:
         odom = line.split(',')
         # Track the test file's own counters (the earlier version wrote to
         # `colum` and derived `testrow` from the training counter `row`).
         testcolum = len(odom)
         if 'PM2.5' in odom:
             testlist.append([int(value) for value in odom[2:11]])
         testrow = testrow + 1
 print("测试数据是：{0}行 ：{1}列 的数据".format(testrow, testcolum))
 print("有{0}个测试数据".format(len(testlist)))
 print(testlist)

保存预测结果到csv文件中：

 # Step 4b: predict every test sample with the trained parameters
 # (b_0, th_0..th_8) and write the results as "id,value" rows to a CSV file.
 trained_weights = (th_0, th_1, th_2, th_3, th_4, th_5, th_6, th_7, th_8)
 # 'wb' is the mode Python 2's csv module expects; the with-block closes the
 # file even if a prediction raises.
 with open('d:\\csv_result.csv', 'wb') as csvfile:
     writer = csv.writer(csvfile)
     writer.writerow(['id', 'value'])
     for k, sample in enumerate(testlist):
         xset = np.array(sample)
         # Kept in its own name so the training labels in `result` are not
         # overwritten.
         prediction = b_0
         for feature_index, weight in enumerate(trained_weights):
             prediction = prediction + weight * xset[feature_index]
         # np.ravel() handles both a plain scalar and a (1, 1) array before
         # the value is truncated to an int; negative values are clamped to 0.
         int_result = max(int(np.ravel(prediction)[0]), 0)
         id_list = [('id_{0}'.format(k), '{0}'.format(int_result))]
         print(id_list)
         writer.writerows(id_list)

完整的程序：

 # -*- coding:UTF-8 -*-
 # Complete script: load the PM2.5 rows of train.csv, fit a 9-input linear
 # model by full-batch gradient descent, print the residuals on the training
 # set, then predict the test set and write the predictions to a CSV file.
 __author__ = 'tao'

 import csv
 import cv2
 import sys
 import numpy as np
 import math

 # ---- Step 1: load the training data ---------------------------------------
 filename = 'F:/台湾机器学习/data/train.csv'
 # The path contains Chinese characters; under Python 2 it is decoded to a
 # unicode object before being handed to open().
 ufilename = unicode(filename, "utf8")

 # NOTE: `list` shadows the builtin; the name is kept because the rest of the
 # script (and the mini-batch variant that follows it) reads `list`/`result`.
 list = []    # one list of 9 ints per PM2.5 row (fields 3..11)
 result = []  # one single-element list per PM2.5 row (field 12)
 row = 0      # number of lines read from the training file
 colum = 0    # number of comma-separated fields in the last line read
 with open(ufilename, 'r') as f:
     for line in f:
         odom = line.split(',')
         colum = len(odom)
         if 'PM2.5' in odom:
             list.append([int(value) for value in odom[3:12]])
             result.append([int(value) for value in odom[12:13]])
         row = row + 1
 print("有{0}个训练数据".format(len(list)))

 # ---- Step 2: train y = w0*x0 + ... + w8*x8 + b0 ----------------------------
 alpha = 0.0001  # learning rate (was commented out, leaving `alpha` undefined)
 # NOTE(review): the features are used unnormalized -- confirm that this alpha
 # converges on the real data.

 features = np.array(list, dtype=float)          # one row per training sample
 labels = np.array(result, dtype=float).ravel()  # one label per training sample
 length = len(features)
 if length == 0:
     raise ValueError("no training samples loaded")

 bias = np.random.rand()
 weights = np.random.rand(9)
 jtheta = 0.0  # cost of the previous epoch
 for k in range(1000):
     # Residuals are computed once per epoch from the current parameters so
     # that the bias and all weights are updated simultaneously.
     errors = features.dot(weights) + bias - labels
     bias = bias - alpha / length * errors.sum()
     weights = weights - alpha / length * features.T.dot(errors)

     # Cost J = 1/(2m) * sum(error^2); `comp` is its change since last epoch.
     jtheta_1 = 0.5 / length * np.sum(errors ** 2)
     comp = math.fabs(jtheta_1 - jtheta)
     # Columns: cost change, cost, bias, w0..w8. The undefined `dgree` scale
     # factor from the earlier version of this line has been dropped.
     print("%10.5f   %10.5f  %10.5f  %10.5f %10.5f   %10.5f  %10.5f  %10.5f %10.5f   %10.5f  %10.5f  %10.5f \n" % ((comp, jtheta_1, bias) + tuple(weights)))
     jtheta = jtheta_1

 # Expose the parameters as (1, 1) arrays named b_0 and th_0..th_8, the shape
 # np.random.rand(1, 1) used to give them.
 b_0 = np.array([[bias]])
 th_0, th_1, th_2, th_3, th_4, th_5, th_6, th_7, th_8 = [
     np.array([[weight]]) for weight in weights
 ]
 trained_weights = (th_0, th_1, th_2, th_3, th_4, th_5, th_6, th_7, th_8)

 print("-训练得到的权值如下--")
 print(" %10.5f %10.5f  %10.5f %10.5f   %10.5f  %10.5f  %10.5f %10.5f   %10.5f  %10.5f \n" % ((bias,) + tuple(weights)))

 # ---- Step 3: residuals on the training set ---------------------------------
 for sample_index, sample in enumerate(list):
     xset = np.array(sample)
     prediction = b_0
     for feature_index, weight in enumerate(trained_weights):
         prediction = prediction + weight * xset[feature_index]
     error = prediction - np.array(result[sample_index])
     print("训练集的实际误差{0}".format(error))

 # ---- Step 4a: load the test data -------------------------------------------
 testfilename = 'F:/台湾机器学习/data/test_X.csv'
 utestfilename = unicode(testfilename, "utf8")  # same Chinese-path handling
 testlist = []  # one list of 9 ints per PM2.5 row (fields 2..10)
 testrow = 0    # number of lines read from the test file
 testcolum = 0  # number of comma-separated fields in the last line read
 with open(utestfilename, 'r') as f:
     for line in f:
         odom = line.split(',')
         # Track the test file's own counters (the earlier version wrote to
         # `colum` and derived `testrow` from the training counter `row`).
         testcolum = len(odom)
         if 'PM2.5' in odom:
             testlist.append([int(value) for value in odom[2:11]])
         testrow = testrow + 1
 print("测试数据是：{0}行 ：{1}列 的数据".format(testrow, testcolum))
 print("有{0}个测试数据".format(len(testlist)))
 print(testlist)

 # ---- Step 4b: write the predictions ----------------------------------------
 # 'wb' is the mode Python 2's csv module expects; the with-block closes the
 # file even if a prediction raises.
 with open('d:\\csv_result.csv', 'wb') as csvfile:
     writer = csv.writer(csvfile)
     writer.writerow(['id', 'value'])
     for k, sample in enumerate(testlist):
         xset = np.array(sample)
         # Kept in its own name so the training labels in `result` are not
         # overwritten.
         prediction = b_0
         for feature_index, weight in enumerate(trained_weights):
             prediction = prediction + weight * xset[feature_index]
         # Truncate to an int and clamp negative predictions to 0.
         int_result = max(int(np.ravel(prediction)[0]), 0)
         id_list = [('id_{0}'.format(k), '{0}'.format(int_result))]
         print(id_list)
         writer.writerows(id_list)

又试了试 batch gradient descent（按批次的梯度下降），貌似没发现什么新的东西

# Mini-batch variant: fit y = w0*x0 + w1*x1 + ... + w8*x8 + b0 by walking over
# the training set in consecutive chunks of `batch` samples and taking one
# gradient step per chunk. Reads the training samples from `list` (features)
# and `result` (labels).
alpha = 0.0001  # learning rate
# NOTE(review): the features are used unnormalized -- confirm that this alpha
# converges on the real data.
batch = 20      # samples per gradient step

features = np.array(list, dtype=float)          # one row per training sample
labels = np.array(result, dtype=float).ravel()  # one label per training sample

b_0 = np.random.rand(1, 1)
th = np.random.rand(1, 9)
for k in range(5000):
    # Visit every chunk in every epoch; the last chunk may be shorter than
    # `batch` when the sample count is not a multiple of it.
    for start in range(0, len(features), batch):
        xbatch = features[start:start + batch]
        ybatch = labels[start:start + batch]
        size = len(xbatch)
        # Residuals are computed before either parameter changes so the bias
        # and the weights are updated simultaneously.
        errors = xbatch.dot(th[0]) + b_0[0, 0] - ybatch
        b_0 = b_0 - alpha / size * errors.sum()
        # errors.dot(xbatch) sums error_i * x_i over the whole chunk.
        th = th - alpha / size * errors.dot(xbatch)
    # One progress line per epoch: bias followed by w0..w8.
    print(" %10.5f  %10.5f %10.5f   %10.5f  %10.5f  %10.5f %10.5f   %10.5f  %10.5f  %10.5f \n" % ((b_0[0][0],) + tuple(th[0])))
print("-训练得到的权值如下--")
print(" %10.5f %10.5f  %10.5f %10.5f   %10.5f  %10.5f  %10.5f %10.5f   %10.5f  %10.5f \n" % ((b_0[0][0],) + tuple(th[0])))

个人收藏笔记记录

开通VIP