Time Series Competition (1)

Learning time series forecasting and putting it to use in a competition.


Competition overview

Competition website

Evaluation metric

The competition is scored with RMSE:

\[ \mathrm{RMSE}=\sqrt{\frac{1}{n} \sum_{i=1}^{n}\left(y_{i}-y_{i}^{*}\right)^{2}} \]
where \(y_i\) is the true value of the \(i\)-th sample, \(y_i^*\) is its predicted value, and \(n\) is the total number of samples.
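As a sanity check, the metric is a one-liner in NumPy (a minimal sketch with made-up numbers):

import numpy as np

def rmse(y_true: np.ndarray, y_pred: np.ndarray) -> float:
    """Root mean squared error between true and predicted values."""
    return float(np.sqrt(np.mean((y_true - y_pred) ** 2)))

# Toy example with made-up numbers
print(rmse(np.array([1.0, 2.0, 3.0]), np.array([1.5, 2.0, 2.0])))  # ~0.645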

Ideas

Inspect the data, paying particular attention to points with unusual standard deviations or points where the model's squared error is very large. Consider whether to drop some data, for example last year's records; clear anomalies can either be removed or filled in by interpolation (a sketch follows below).
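For instance, one simple approach is to mask values more than a few standard deviations away from their station's mean and interpolate over the gaps. This is only a sketch of the idea, assuming the competition's `power` column and `id_encode` station id, not a tested part of the pipeline:

import pandas as pd

def mask_and_interpolate(df: pd.DataFrame, col: str = 'power', k: float = 3.0) -> pd.DataFrame:
    """Replace per-station outliers beyond k standard deviations with interpolated values."""
    df = df.copy()
    grp = df.groupby('id_encode')[col]
    mean, std = grp.transform('mean'), grp.transform('std')
    outlier = (df[col] - mean).abs() > k * std
    df.loc[outlier, col] = None
    # Linear interpolation within each station's series
    df[col] = df.groupby('id_encode')[col].transform(lambda s: s.interpolate(limit_direction='both'))
    return df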

Look for external data to join onto the dataset, such as weather and temperature, or holiday information (see the holiday-flag sketch below).
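A holiday flag can be joined in from a hand-maintained date list. A minimal sketch: the dates below are placeholders rather than a real holiday calendar, and `add_holiday_flag` is a hypothetical helper:

import pandas as pd

# Placeholder list: replace with the actual public-holiday dates for the target period
HOLIDAYS = pd.to_datetime(['2023-01-01', '2023-01-22', '2023-10-01'])

def add_holiday_flag(df: pd.DataFrame, date_col: str = 'ds') -> pd.DataFrame:
    df = df.copy()
    dates = pd.to_datetime(df[date_col].astype(str), format='%Y%m%d')
    df['is_holiday'] = dates.isin(HOLIDAYS).astype(int)
    return df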

The H3 codes can be decoded into latitude/longitude and fed straight into the model, or summarized into a city feature (and if per-city congestion data is available, that can be used as well).

Should we predict hour by hour and sum the hours into a daily forecast, or predict at the daily level directly?

It depends on whether you have hourly feature data. If you don't, predicting at the daily level works better; aggregating the hourly targets into daily totals is a one-liner, as sketched below.
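Aggregating the hourly power records into daily totals per station might look like this (a sketch assuming the `power.csv` layout used later, with `id_encode`, `ds`, and `power` columns):

import pandas as pd

hourly = pd.read_csv('./data1/train/power.csv')
# Sum the hourly power readings into one daily value per station
daily = hourly.groupby(['id_encode', 'ds'], as_index=False)['power'].sum()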

Locating the city from the H3 code

import re
import pandas as pd
import numpy as np
from h3 import h3
from geopy.geocoders import Nominatim

train_stub_info = pd.read_csv("D:\\2023“SEED”第四届江苏大数据开发与应用大赛--新能源赛道\\project\\data1\\train\\stub_info.csv")
test_stub_info = pd.read_csv("D:\\2023“SEED”第四届江苏大数据开发与应用大赛--新能源赛道\\project\\data1\\test\\stub_info.csv")

# Create the geocoder once, outside the loop
geolocator = Nominatim(user_agent="my-app")

def add_location_features(stub_info):
    """Decode each H3 cell to lat/lon, then reverse-geocode the prefecture-level city."""
    city_list, latitude_list, longitude_list = [], [], []
    for i in range(len(stub_info)):
        # Decode the H3 index into a (latitude, longitude) pair
        latitude, longitude = h3.h3_to_geo(stub_info['h3'][i])
        latitude_list.append(latitude)
        longitude_list.append(longitude)
        # Reverse-geocode the coordinates into a human-readable address
        location = geolocator.reverse(f"{latitude}, {longitude}")
        address = location.address  # a string
        # Jiangsu prefecture-level city names are 2-3 Chinese characters ending in 市
        # (南京市, 苏州市, 连云港市); extract them with a regex
        city = re.findall(r',*?([\u4e00-\u9fa5]+市)', address, re.S)
        if i % 10 == 0:
            print(f"iteration {i} running", city)
        city_list.append(city[-1])
    stub_info['latitude'] = latitude_list
    stub_info['longitude'] = longitude_list
    stub_info['city'] = city_list
    return stub_info

# Run the same decoding pass on both the train and test stub tables
train_stub_info = add_location_features(train_stub_info)
test_stub_info = add_location_features(test_stub_info)

import pypinyin
def pin(x):
    return "".join(pypinyin.lazy_pinyin(x))

city_name = set(train_stub_info['city'].to_numpy())
city_name_dict = {}
for i in city_name:
    # Strip the trailing "shi" (市) so the name matches tianqihoubao's URL slugs
    city_name_dict[i] = pin(i)[:-3]

Code for scraping the weather

import re
import requests
import pandas as pd
import html5lib  # backend parser for pd.read_html
from bs4 import BeautifulSoup

def Get_data(url):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
        'Accept': 'text/html;q=0.9,*/*;q=0.8',
        'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
        'Accept-Encoding': 'gzip',
        'Connection': 'close'
    }
    # Fetch the page
    req = requests.get(url, headers=headers)
    # Parse the HTML
    soup = BeautifulSoup(req.text, "lxml")
    # Find the table we need
    tables = soup.find(name='table')
    # Extract the table into a DataFrame
    data = pd.read_html(str(tables), encoding='utf-8')[0]
    return data

def data_process(datac, c):
    # The first row holds the real column names
    datac.columns = datac.values.tolist()[0]
    datac.drop(0, inplace=True)
    # "2023年01月01日" -> 20230101
    datac['ds'] = datac['日期'].apply(lambda x: int("".join(re.split("年|月|日", x))))
    # Split "low℃/high℃" into numeric low/high temperatures
    datac['ltmp'] = datac['最低气温/最高气温'].apply(lambda x: int(re.split("℃|/|日", x)[0]))
    datac['htmp'] = datac['最低气温/最高气温'].apply(lambda x: int(re.split("℃|/|日", x)[2]))
    datac['city'] = c
    # "晴/多云" -> day and night weather conditions
    datac['weather1'] = datac['天气状况'].apply(lambda x: re.split("/", x)[0])
    datac['weather2'] = datac['天气状况'].apply(lambda x: re.split("/", x)[1])
    return datac
result = []
# train_day is assumed to be a list of 'YYYYMM' month strings covering the data period
for i in city_name:
    for j in train_day:
        create_url = 'http://www.tianqihoubao.com/lishi/' + city_name_dict[i] + '/month/' + j + '.html'
        print('opening page:', create_url, i, j)
        # Fetch the data
        data = Get_data(create_url)
        # Process it
        data = data_process(data, i)
        print(data)
        namedata = i + j
        data.to_csv('cityweather/' + namedata + '.csv', index=False, encoding='gbk')
        result.append(data)

data_all = pd.concat(result, axis=0)
print(data_all)
# Write the combined table to disk
data_all.to_csv('cityweather.csv', index=False, encoding='gbk')

Data cleaning and preprocessing

# Imports
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold, KFold, GroupKFold
from sklearn.metrics import mean_squared_error, mean_absolute_error
import matplotlib.pyplot as plt
import tqdm
import sys
import os
import gc
import argparse
import warnings

# Load the data (the stub files are the ones enriched with latitude/longitude/city above)
train_power_forecast_history = pd.read_csv('./data1/train/power_forecast_history.csv', encoding='GB18030')
train_power = pd.read_csv('./data1/train/power.csv', encoding='GB18030')
train_stub_info = pd.read_csv('./data1/train_stub_info.csv', encoding='GB18030')

test_power_forecast_history = pd.read_csv('./data1/test/power_forecast_history.csv', encoding='GB18030')
test_stub_info = pd.read_csv('./data1/test_stub_info.csv', encoding='GB18030')
cityweather = pd.read_csv('./data1/cityweather.csv', encoding='GB18030')

# Drop index columns left over from CSV round-trips and the raw weather strings
for frame in (train_stub_info, test_stub_info, cityweather):
    frame.drop(columns=['Unnamed: 0'], errors='ignore', inplace=True)
cityweather.drop(columns=['日期', '天气状况'], errors='ignore', inplace=True)

# Collapse the hourly forecast history to one row per station per day
train_df = train_power_forecast_history.groupby(['id_encode','ds']).head(1)
del train_df['hour']

test_df = test_power_forecast_history.groupby(['id_encode','ds']).head(1)
del test_df['hour']

# Daily charging volume per station
tmp_df = train_power.groupby(['id_encode','ds'])['power'].sum().reset_index()

# Merge the charging volume
train_df = train_df.merge(tmp_df, on=['id_encode','ds'], how='left')

# Merge the station info
train_df = train_df.merge(train_stub_info, on='id_encode', how='left')
test_df = test_df.merge(test_stub_info, on='id_encode', how='left')

for frame in (train_df, test_df):
    del frame['address']
    del frame['center']
    del frame['h3']

# Join the scraped weather on (city, ds), keeping the first record per key
weather = cityweather[['city','ds','ltmp','htmp','weather1','weather2']].drop_duplicates(['city','ds'])
train_df = train_df.merge(weather, on=['city','ds'], how='left')
test_df = test_df.merge(weather, on=['city','ds'], how='left')

# Encode categoricals as integers
train_df['flag'] = train_df['flag'].map({'A': 0, 'B': 1})
test_df['flag'] = test_df['flag'].map({'A': 0, 'B': 1})

citymap = {c: i for i, c in enumerate(set(train_df['city']))}
weathermap = {w: i for i, w in enumerate(set(train_df['weather1'].tolist() + train_df['weather2'].tolist()))}

train_df['city'] = train_df['city'].map(citymap)
test_df['city'] = test_df['city'].map(citymap)
train_df['weatherA'] = train_df['weather1'].map(weathermap)
test_df['weatherA'] = test_df['weather1'].map(weathermap)
train_df['weatherB'] = train_df['weather2'].map(weathermap)
test_df['weatherB'] = test_df['weather2'].map(weathermap)

def get_time_feature(df, col):
    # Expand a YYYYMMDD integer column into calendar features
    df_copy = df.copy()
    prefix = col + "_"
    df_copy['new_' + col] = df_copy[col].astype(str)

    col = 'new_' + col
    df_copy[col] = pd.to_datetime(df_copy[col], format='%Y%m%d')
    df_copy[prefix + 'year'] = df_copy[col].dt.year
    df_copy[prefix + 'month'] = df_copy[col].dt.month
    df_copy[prefix + 'day'] = df_copy[col].dt.day
    # df_copy[prefix + 'weekofyear'] = df_copy[col].dt.isocalendar().week
    df_copy[prefix + 'dayofweek'] = df_copy[col].dt.dayofweek
    df_copy[prefix + 'is_wknd'] = df_copy[col].dt.dayofweek // 6  # 1 only on Sunday
    df_copy[prefix + 'quarter'] = df_copy[col].dt.quarter
    df_copy[prefix + 'is_month_start'] = df_copy[col].dt.is_month_start.astype(int)
    df_copy[prefix + 'is_month_end'] = df_copy[col].dt.is_month_end.astype(int)
    del df_copy[col]

    return df_copy

train_df = get_time_feature(train_df, 'ds')
test_df = get_time_feature(test_df, 'ds')

# Drop the raw weather strings now that they are integer-encoded
for frame in (train_df, test_df):
    del frame['weather1']
    del frame['weather2']

train_df.to_csv('train_df.csv', index=False, encoding='gbk')
test_df.to_csv('test_df.csv', index=False, encoding='gbk')
# Fill a column's missing values from neighbouring rows of the same station,
# falling back to the station's mean
def fillmy(x, y):
    for i in range(len(y)):
        id1 = y.loc[i, 'id_encode']
        if np.isnan(y.loc[i, x]):
            # Previous row, if it exists and belongs to the same station
            if i - 1 >= 0 and not np.isnan(y.loc[i-1, x]) and id1 == y.loc[i-1, 'id_encode']:
                y.loc[i, x] = y.loc[i-1, x]
                continue
            # Otherwise the next row, same station
            if i + 1 < len(y) and not np.isnan(y.loc[i+1, x]) and id1 == y.loc[i+1, 'id_encode']:
                y.loc[i, x] = y.loc[i+1, x]
                continue
            # Fall back to the station's mean over non-missing values
            y.loc[i, x] = np.mean([t for t in y[x][y['id_encode'] == id1] if not np.isnan(t)])

# Show the columns with missing values
train_null_index = [c for c in train_df.columns if train_df[c].isnull().sum() > 0]
print([(c, train_df[c].isnull().sum()) for c in train_df.columns])

test_null_index = [c for c in test_df.columns if test_df[c].isnull().sum() > 0]
print([(c, test_df[c].isnull().sum()) for c in test_df.columns])

for c in train_null_index:
    fillmy(c, train_df)
for c in test_null_index:
    fillmy(c, test_df)

train_df.to_csv('train_df_fill.csv', index=False, encoding='gbk')
test_df.to_csv('test_df_fill.csv', index=False, encoding='gbk')

Model training

# Train and validate the model with K-fold cross-validation
def cv_model(clf, train_x, train_y, test_x, params, folds=3, n_iter=3000, seed=2023):
    kf = KFold(n_splits=folds, shuffle=True, random_state=seed)

    # Out-of-fold predictions and averaged test-set predictions
    oof = np.zeros(train_x.shape[0])
    test_predict = np.zeros(test_x.shape[0])
    cv_scores = []   # validation RMSE per fold
    cv_scores2 = []  # training RMSE per fold

    for i, (train_index, valid_index) in enumerate(kf.split(train_x, train_y)):
        print('************************************ {} ************************************'.format(str(i+1)))
        trn_x, trn_y, val_x, val_y = train_x.iloc[train_index], train_y[train_index], train_x.iloc[valid_index], train_y[valid_index]

        # Wrap the fold in LightGBM Dataset objects
        train_matrix = clf.Dataset(trn_x, label=trn_y)
        valid_matrix = clf.Dataset(val_x, label=val_y)

        # Train the model
        model = clf.train(params, train_matrix, n_iter, valid_sets=[train_matrix, valid_matrix], categorical_feature=[])

        # Predict on the validation fold, the training fold, and the test set
        val_pred = model.predict(val_x, num_iteration=model.best_iteration)
        train_pred = model.predict(trn_x, num_iteration=model.best_iteration)
        test_pred = model.predict(test_x, num_iteration=model.best_iteration)

        oof[valid_index] = val_pred
        test_predict += test_pred / kf.n_splits

        # Compute and print this fold's scores
        score = np.sqrt(mean_squared_error(val_pred, val_y))
        cv_scores.append(score)

        score2 = np.sqrt(mean_squared_error(train_pred, trn_y))
        cv_scores2.append(score2)

        print(cv_scores)
        print(cv_scores2)

    return oof, test_predict, model

# Call the function above to train the model and generate predictions

params = {
    'boosting_type': 'dart',  # alternative: 'gbdt'
    'objective': 'regression',
    'metric': 'rmse',
    'min_child_weight': 5,
    'num_leaves': 2 ** 8,
    'lambda_l2': 10,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'bagging_freq': 4,
    'learning_rate': 0.1,
    'seed': 2023,
    'nthread': 16,
    'verbose': -1,
    'drop_rate': 0.8,
    'skip_drop': 0.7,
    'max_drop': 5,
    'uniform_drop': False,
    'xgboost_dart_mode': True,
    'drop_seed': 4,
    'device': 'gpu'
}

cols = [f for f in test_df.columns if f not in ['ds','power','h3','f3','ser_price','newf1','newf2']]
lgb_oof, lgb_test, lgb_model = cv_model(lgb, train_df[cols], train_df['power'], test_df[cols], params)

# Clamp negative predictions to zero and write the submission file
test_df['power'] = lgb_test
test_df['power'] = test_df['power'].clip(lower=0)
test_df[['id_encode','ds','power']].to_csv('result.csv', index=False)

Hyperparameter search code

from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import make_scorer
import time

def median_absolute_percentage_error(y_true, y_pred):
    return np.median(np.abs((y_pred - y_true) / y_true))

def RMSE(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))

# Scorer: negated RMSE (sklearn maximizes scores, so lower RMSE means a higher score);
# median_absolute_percentage_error could be swapped in here instead
neg_rmse = make_scorer(RMSE, greater_is_better=False)

print(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()), 'starting RandomizedSearchCV')
model_lgb = lgb.LGBMRegressor(objective='regression',
                              metric='rmse',
                              bagging_freq=4,
                              boosting_type='dart',
                              verbose=-1,
                              uniform_drop=False,
                              xgboost_dart_mode=True,
                              drop_seed=4,
                              device='gpu')

param_distributions = {
    'max_depth': [5,6,7,8,9,10],
    'num_leaves': range(20,60,3),
    'learning_rate': [0.1,0.05,0.01],
    'min_child_weight': [5,10,20,30,40,50,60,70,80,150],
    'lambda_l2': range(20,100,3),
    'feature_fraction': [0.5,0.6,0.7,0.8,0.9],
    'bagging_fraction': [0.5,0.6,0.7,0.8,0.9],
    'drop_rate': np.arange(0.1,1,0.1),
    'skip_drop': np.arange(0.1,1,0.1),
    'max_drop': [5,10,15,30,100],
}
search = RandomizedSearchCV(estimator=model_lgb,
                            n_iter=500,
                            param_distributions=param_distributions,
                            scoring=neg_rmse,
                            cv=3,
                            verbose=1,
                            n_jobs=-1,
                            random_state=0)
cols = [f for f in test_df.columns if f not in ['ds','power','h3','f3','ser_price','newf1','newf2']]
X, y = train_df[cols], train_df['power']
search.fit(X, y)
print(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()), 'finished RandomizedSearchCV')
print('Best parameters found by random search are:', search.best_params_)

Notes from other people's write-ups

# Relative time-offset features: days since the first date / until the last date
df_copy['diff_from_start'] = (df_copy[col] - df_copy[col].iloc[0]).dt.days
df_copy['diff_to_end'] = (df_copy[col].iloc[-1] - df_copy[col]).dt.days

Lag features, difference features, and rolling-window statistics were constructed:

(1) Lag features: shifting the series back in time carries information from earlier periods into the current row;

(2) Difference features: these capture the growth between adjacent periods and describe how the series rises and falls. On top of them you can also build ratio changes between adjacent values, second-order differences, and so on;

(3) Rolling-window statistics: with windows of different sizes you can compute the mean, maximum, minimum, median, and variance over the window, reflecting how the series has behaved in the most recent period.

Source: original post by CSDN blogger "Unicornlyy", licensed under CC 4.0 BY-SA; reposts must include the original link and this notice.
Original link: https://blog.csdn.net/m0_68165821/article/details/133887643

# Concatenate train and test so lag features are computed over the full timeline
df = pd.concat([train_df, test_df], axis=0).reset_index(drop=True)

# Lag features
for i in range(7, 36):
    df[f'power_shift{i}'] = df.groupby('id_encode')['power'].shift(i)

# Lag + difference features
for i in range(1, 4):
    df[f'power_shift7_diff{i}'] = df.groupby('id_encode')['power_shift7'].diff(i)

# Rolling-window statistics
for win in [7,14,28,35,50,70]:
    df[f'power_win{win}_mean'] = df.groupby('id_encode')['power'].rolling(window=win, min_periods=3, closed='left').mean().values
    df[f'power_win{win}_median'] = df.groupby('id_encode')['power'].rolling(window=win, min_periods=3, closed='left').median().values
    df[f'power_win{win}_max'] = df.groupby('id_encode')['power'].rolling(window=win, min_periods=3, closed='left').max().values
    df[f'power_win{win}_min'] = df.groupby('id_encode')['power'].rolling(window=win, min_periods=3, closed='left').min().values
    df[f'power_win{win}_std'] = df.groupby('id_encode')['power'].rolling(window=win, min_periods=3, closed='left').std().values

# Lag + rolling-window statistics
for win in [7,14,28,35,50,70]:
    df[f'power_shift7_win{win}_mean'] = df.groupby('id_encode')['power_shift7'].rolling(window=win, min_periods=3, closed='left').mean().values
    df[f'power_shift7_win{win}_median'] = df.groupby('id_encode')['power_shift7'].rolling(window=win, min_periods=3, closed='left').median().values
    df[f'power_shift7_win{win}_max'] = df.groupby('id_encode')['power_shift7'].rolling(window=win, min_periods=3, closed='left').max().values
    df[f'power_shift7_win{win}_min'] = df.groupby('id_encode')['power_shift7'].rolling(window=win, min_periods=3, closed='left').min().values
    df[f'power_shift7_win{win}_sum'] = df.groupby('id_encode')['power_shift7'].rolling(window=win, min_periods=3, closed='left').sum().values
    df[f'power_shift7_win{win}_std'] = df.groupby('id_encode')['power_shift7'].rolling(window=win, min_periods=3, closed='left').std().values

############################ Plotting helper
def my_plot(df, id_encode, start_date, end_date, predict=False):
    # Line plot of daily power; id_encode = -1 plots the total over all stations
    fig = plt.figure(figsize=(20, 10))
    df = df.loc[(df['ds'] >= start_date) & (df['ds'] <= end_date)]
    if id_encode > -1:
        df = df.loc[df['id_encode'] == id_encode]

    df_power = df.groupby(by='ds')['power'].sum().reset_index()
    plt.plot(pd.to_datetime(df_power['ds'], format='%Y%m%d'), df_power['power'], color='blue')
    if predict:
        df_power_pre = df.groupby(by='ds')['power_pre'].sum().reset_index()
        plt.plot(pd.to_datetime(df_power_pre['ds'], format='%Y%m%d'), df_power_pre['power_pre'], color='red')

    # Title and axis labels
    plt.title('Power vs Date')
    plt.xlabel('Date')
    plt.ylabel('Power')

    # Show the figure
    plt.show()

LightGBM

Data preprocessing

For the many-samples problem, LightGBM introduces Gradient-based One-Side Sampling (GOSS); for the many-features problem, it introduces Exclusive Feature Bundling (EFB).

GOSS keeps all samples with large gradients and combines them with a random subsample of the small-gradient ones, up-weighting the sampled part so the gradient statistics stay approximately unbiased.
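A minimal sketch of the selection step, as I understand it from the paper (my own illustration, not LightGBM's actual code): keep the top `a` fraction of samples by gradient magnitude, randomly sample a `b` fraction of the rest, and up-weight the sampled part by `(1-a)/b`:

import numpy as np

def goss_sample(gradients: np.ndarray, a: float = 0.2, b: float = 0.1):
    """Return selected sample indices and their weights under GOSS."""
    n = len(gradients)
    order = np.argsort(-np.abs(gradients))  # sort by |gradient|, descending
    top_k = int(a * n)
    top_idx = order[:top_k]                 # keep every large-gradient sample
    rest = order[top_k:]
    rand_idx = np.random.choice(rest, size=int(b * n), replace=False)
    idx = np.concatenate([top_idx, rand_idx])
    # Up-weight the sampled small-gradient part to keep the gradient sum unbiased
    weights = np.ones(len(idx))
    weights[top_k:] = (1 - a) / b
    return idx, weights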

EFB

Mutually exclusive sparse features are merged into a single bundle: each merged feature is shifted by a fixed offset before the features are added together, reducing the feature count and hence the dimensionality.
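A toy illustration of the bundling idea (my own sketch): two features that are never non-zero on the same row can share one column once one of them is offset so the value ranges don't collide:

import numpy as np

# Two mutually exclusive sparse features (never non-zero on the same row)
f1 = np.array([0, 3, 0, 1, 0])
f2 = np.array([2, 0, 4, 0, 0])

# Offset f2 by the range of f1 so the two value ranges stay distinguishable
offset = f1.max() + 1
bundle = f1 + np.where(f2 > 0, f2 + offset, 0)
# Values 1..3 in the bundle come from f1; values above the offset come from f2
print(bundle)  # [6 3 8 1 0]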

Histogram-based acceleration

Continuous feature values are bucketed into a small number of discrete bins, and candidate splits are evaluated per bin boundary rather than per raw value; a node's histogram can also be obtained cheaply by subtracting its sibling's histogram from the parent's.
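A simplified sketch of a histogram-based split search (illustrative only: it uses the variance-style gain GL^2/nL + GR^2/nR and ignores Hessians and regularization):

import numpy as np

def best_split_by_histogram(x: np.ndarray, g: np.ndarray, n_bins: int = 16):
    """Find the bin boundary with the largest (simplified) split gain.

    x: one feature column; g: per-sample gradients."""
    bins = np.quantile(x, np.linspace(0, 1, n_bins + 1)[1:-1])  # interior bin edges
    bin_id = np.digitize(x, bins)
    grad_hist = np.bincount(bin_id, weights=g, minlength=n_bins)
    cnt_hist = np.bincount(bin_id, minlength=n_bins)

    best_gain, best_bin = -np.inf, None
    G, N = grad_hist.sum(), cnt_hist.sum()
    gl, nl = 0.0, 0
    for b in range(n_bins - 1):  # candidate split after bin b
        gl += grad_hist[b]
        nl += cnt_hist[b]
        nr = N - nl
        if nl == 0 or nr == 0:
            continue
        gain = gl**2 / nl + (G - gl)**2 / nr
        if gain > best_gain:
            best_gain, best_bin = gain, b
    return best_bin, best_gain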

Multi-machine parallelism

Voting-based parallelism

Each worker holds all the features for its own slice of the data. The workers build their own local histograms, which are merged into a global histogram, and the optimal split point is found on the global histogram (in the voting variant, workers first vote on locally promising features so that only those features' histograms need to be merged).
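The merge step itself is just an element-wise sum of the per-worker histograms (a toy sketch with made-up numbers):

import numpy as np

# Made-up local gradient histograms from three workers (same binning everywhere)
local_hists = [np.array([0.2, 1.1, 0.5, 0.0]),
               np.array([0.4, 0.9, 0.1, 0.3]),
               np.array([0.0, 0.2, 0.6, 0.7])]

# Element-wise sum gives the global histogram used for the split search
global_hist = np.sum(local_hists, axis=0)
print(global_hist)  # [0.6 2.2 1.2 1. ]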