{ "cells": [ { "cell_type": "code", "execution_count": 2, "id": "09ddd30a", "metadata": {}, "outputs": [], "source": [ "import pandas as pd" ] }, { "cell_type": "code", "execution_count": 3, "id": "fb5dde4c", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
tanggaljumlah_penumpang_per_hariis_weekendis_holiday_natflag_contiguous_offflag_almost_contiguous_offevents
02024-01-01316601000
12024-01-02290100000
22024-01-03279500000
32024-01-04278400000
42024-01-05321900000
........................
6042025-08-27370200000
6052025-08-28341200000
6062025-08-29320100000
6072025-08-30240610000
6082025-08-31209110000
\n", "

609 rows × 7 columns

\n", "
" ], "text/plain": [ " tanggal jumlah_penumpang_per_hari is_weekend is_holiday_nat \\\n", "0 2024-01-01 3166 0 1 \n", "1 2024-01-02 2901 0 0 \n", "2 2024-01-03 2795 0 0 \n", "3 2024-01-04 2784 0 0 \n", "4 2024-01-05 3219 0 0 \n", ".. ... ... ... ... \n", "604 2025-08-27 3702 0 0 \n", "605 2025-08-28 3412 0 0 \n", "606 2025-08-29 3201 0 0 \n", "607 2025-08-30 2406 1 0 \n", "608 2025-08-31 2091 1 0 \n", "\n", " flag_contiguous_off flag_almost_contiguous_off events \n", "0 0 0 0 \n", "1 0 0 0 \n", "2 0 0 0 \n", "3 0 0 0 \n", "4 0 0 0 \n", ".. ... ... ... \n", "604 0 0 0 \n", "605 0 0 0 \n", "606 0 0 0 \n", "607 0 0 0 \n", "608 0 0 0 \n", "\n", "[609 rows x 7 columns]" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df = pd.read_csv('lrt_daily_events_no_leak.csv')\n", "df" ] }, { "cell_type": "code", "execution_count": 4, "id": "2c081ee4", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "RangeIndex: 609 entries, 0 to 608\n", "Data columns (total 7 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 tanggal 609 non-null object\n", " 1 jumlah_penumpang_per_hari 609 non-null int64 \n", " 2 is_weekend 609 non-null int64 \n", " 3 is_holiday_nat 609 non-null int64 \n", " 4 flag_contiguous_off 609 non-null int64 \n", " 5 flag_almost_contiguous_off 609 non-null int64 \n", " 6 events 609 non-null int64 \n", "dtypes: int64(6), object(1)\n", "memory usage: 33.4+ KB\n" ] } ], "source": [ "df.info()" ] }, { "cell_type": "code", "execution_count": 5, "id": "6ac50d54", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "RangeIndex: 609 entries, 0 to 608\n", "Data columns (total 7 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 tanggal 609 non-null datetime64[ns]\n", " 1 jumlah_penumpang_per_hari 609 non-null int64 \n", " 2 is_weekend 609 non-null int64 \n", " 3 is_holiday_nat 609 non-null int64 \n", " 4 flag_contiguous_off 609 non-null int64 \n", " 5 flag_almost_contiguous_off 609 non-null int64 \n", " 6 events 609 non-null int64 \n", "dtypes: datetime64[ns](1), int64(6)\n", "memory usage: 33.4 KB\n" ] } ], "source": [ "df['tanggal'] = pd.to_datetime(df['tanggal'])\n", "df.info()" ] }, { "cell_type": "code", "execution_count": 6, "id": "6a5218d6", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
jumlah_penumpang_per_hariis_weekendis_holiday_natflag_contiguous_offflag_almost_contiguous_offevents
tanggal
2024-01-01316601000
2024-01-02290100000
2024-01-03279500000
2024-01-04278400000
2024-01-05321900000
\n", "
" ], "text/plain": [ " jumlah_penumpang_per_hari is_weekend is_holiday_nat \\\n", "tanggal \n", "2024-01-01 3166 0 1 \n", "2024-01-02 2901 0 0 \n", "2024-01-03 2795 0 0 \n", "2024-01-04 2784 0 0 \n", "2024-01-05 3219 0 0 \n", "\n", " flag_contiguous_off flag_almost_contiguous_off events \n", "tanggal \n", "2024-01-01 0 0 0 \n", "2024-01-02 0 0 0 \n", "2024-01-03 0 0 0 \n", "2024-01-04 0 0 0 \n", "2024-01-05 0 0 0 " ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# set 'tanggal' as the dataframe index and sort by it\n", "df.set_index('tanggal', inplace=True)\n", "df.sort_index(inplace=True)\n", "df.head()" ] }, { "cell_type": "code", "execution_count": 8, "id": "be623133", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "jumlah_penumpang_per_hari 6546\n", "is_weekend 1\n", "is_holiday_nat 1\n", "flag_contiguous_off 0\n", "flag_almost_contiguous_off 0\n", "events 0\n", "Name: 2024-08-17 00:00:00, dtype: int64" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# see df row on df['events'] = 2024-08-17\n", "df.loc['2024-08-17']" ] }, { "cell_type": "code", "execution_count": 6, "id": "2f41917c", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
jumlah_penumpang_per_hariis_weekendis_holiday_natflag_contiguous_offflag_almost_contiguous_offevents
tanggal
2024-01-01316601000
2024-01-02290100000
2024-01-03279500000
2024-01-04278400000
2024-01-05321900000
\n", "
" ], "text/plain": [ " jumlah_penumpang_per_hari is_weekend is_holiday_nat \\\n", "tanggal \n", "2024-01-01 3166 0 1 \n", "2024-01-02 2901 0 0 \n", "2024-01-03 2795 0 0 \n", "2024-01-04 2784 0 0 \n", "2024-01-05 3219 0 0 \n", "\n", " flag_contiguous_off flag_almost_contiguous_off events \n", "tanggal \n", "2024-01-01 0 0 0 \n", "2024-01-02 0 0 0 \n", "2024-01-03 0 0 0 \n", "2024-01-04 0 0 0 \n", "2024-01-05 0 0 0 " ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
jumlah_penumpang_per_hariis_weekendis_holiday_natflag_contiguous_offflag_almost_contiguous_offevents
tanggal
2025-08-27370200000
2025-08-28341200000
2025-08-29320100000
2025-08-30240610000
2025-08-31209110000
\n", "
" ], "text/plain": [ " jumlah_penumpang_per_hari is_weekend is_holiday_nat \\\n", "tanggal \n", "2025-08-27 3702 0 0 \n", "2025-08-28 3412 0 0 \n", "2025-08-29 3201 0 0 \n", "2025-08-30 2406 1 0 \n", "2025-08-31 2091 1 0 \n", "\n", " flag_contiguous_off flag_almost_contiguous_off events \n", "tanggal \n", "2025-08-27 0 0 0 \n", "2025-08-28 0 0 0 \n", "2025-08-29 0 0 0 \n", "2025-08-30 0 0 0 \n", "2025-08-31 0 0 0 " ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "df_train = df.iloc[:-56]\n", "df_test = df.iloc[-56:]\n", "display(df_train.head())\n", "display(df_test.tail())" ] }, { "cell_type": "code", "execution_count": 7, "id": "a1b355cb", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "C:\\Users\\Muhammad Hafiz F\\AppData\\Local\\Temp\\ipykernel_47060\\662747253.py:5: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame.\n", "Try using .loc[row_indexer,col_indexer] = value instead\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " df_train[f'lag_{lag}'] = df_train[y_col].shift(lag)\n", "C:\\Users\\Muhammad Hafiz F\\AppData\\Local\\Temp\\ipykernel_47060\\662747253.py:5: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame.\n", "Try using .loc[row_indexer,col_indexer] = value instead\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " df_train[f'lag_{lag}'] = df_train[y_col].shift(lag)\n", "C:\\Users\\Muhammad Hafiz F\\AppData\\Local\\Temp\\ipykernel_47060\\662747253.py:5: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame.\n", "Try using .loc[row_indexer,col_indexer] = value instead\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " df_train[f'lag_{lag}'] = df_train[y_col].shift(lag)\n", "C:\\Users\\Muhammad Hafiz F\\AppData\\Local\\Temp\\ipykernel_47060\\662747253.py:5: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame.\n", "Try using .loc[row_indexer,col_indexer] = value instead\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " df_train[f'lag_{lag}'] = df_train[y_col].shift(lag)\n", "C:\\Users\\Muhammad Hafiz F\\AppData\\Local\\Temp\\ipykernel_47060\\662747253.py:5: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame.\n", "Try using .loc[row_indexer,col_indexer] = value instead\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " df_train[f'lag_{lag}'] = df_train[y_col].shift(lag)\n", "C:\\Users\\Muhammad Hafiz F\\AppData\\Local\\Temp\\ipykernel_47060\\662747253.py:8: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame.\n", "Try using .loc[row_indexer,col_indexer] = value instead\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " df_train['roll_mean_7'] = df_train[y_col].rolling(window=7, min_periods=7).mean()\n", "C:\\Users\\Muhammad Hafiz F\\AppData\\Local\\Temp\\ipykernel_47060\\662747253.py:9: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame.\n", "Try using .loc[row_indexer,col_indexer] = value instead\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " df_train['roll_mean_14'] = df_train[y_col].rolling(window=14, min_periods=14).mean()\n", "C:\\Users\\Muhammad Hafiz F\\AppData\\Local\\Temp\\ipykernel_47060\\662747253.py:12: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame.\n", "Try using .loc[row_indexer,col_indexer] = value instead\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " df_train['roll_std_7'] = df_train[y_col].rolling(window=7, min_periods=7).std()\n", "C:\\Users\\Muhammad Hafiz F\\AppData\\Local\\Temp\\ipykernel_47060\\662747253.py:15: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame.\n", "Try using .loc[row_indexer,col_indexer] = value instead\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " df_train['diff_seasonal_7'] = df_train[y_col] - df_train[y_col].shift(7)\n" ] } ], "source": [ "y_col = 'jumlah_penumpang_per_hari'\n", "\n", "# 1. Lag features: 1, 2, 3, 7, 14\n", "for lag in [1, 2, 3, 7, 14]:\n", " df_train[f'lag_{lag}'] = df_train[y_col].shift(lag)\n", "\n", "# 2. Rolling means: 7-day & 14-day (past window only)\n", "df_train['roll_mean_7'] = df_train[y_col].rolling(window=7, min_periods=7).mean()\n", "df_train['roll_mean_14'] = df_train[y_col].rolling(window=14, min_periods=14).mean()\n", "\n", "# 3. Rolling std: 7-day\n", "df_train['roll_std_7'] = df_train[y_col].rolling(window=7, min_periods=7).std()\n", "\n", "# 4. Seasonal difference (lag 7)\n", "df_train['diff_seasonal_7'] = df_train[y_col] - df_train[y_col].shift(7)" ] }, { "cell_type": "code", "execution_count": 8, "id": "02566ee0", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
jumlah_penumpang_per_hariis_weekendis_holiday_natflag_contiguous_offflag_almost_contiguous_offeventslag_1lag_2lag_3lag_7lag_14roll_mean_7roll_mean_14roll_std_7diff_seasonal_7
tanggal
2024-01-01316601000NaNNaNNaNNaNNaNNaNNaNNaNNaN
2024-01-022901000003166.0NaNNaNNaNNaNNaNNaNNaNNaN
2024-01-032795000002901.03166.0NaNNaNNaNNaNNaNNaNNaN
2024-01-042784000002795.02901.03166.0NaNNaNNaNNaNNaNNaN
2024-01-053219000002784.02795.02901.0NaNNaNNaNNaNNaNNaN
2024-01-062863100003219.02784.02795.0NaNNaNNaNNaNNaNNaN
2024-01-072405100002863.03219.02784.0NaNNaN2876.142857NaN270.619307NaN
2024-01-082959000002405.02863.03219.03166.0NaN2846.571429NaN243.630771-207.0
2024-01-093057000002959.02405.02863.02901.0NaN2868.857143NaN256.247555156.0
2024-01-102993000003057.02959.02405.02795.0NaN2897.142857NaN257.660260198.0
2024-01-112952000002993.03057.02959.02784.0NaN2921.142857NaN253.149777168.0
2024-01-123242000002952.02993.03057.03219.0NaN2924.428571NaN257.76724523.0
2024-01-133450100003242.02952.02993.02863.0NaN3008.285714NaN321.944982587.0
2024-01-142729100003450.03242.02952.02405.0NaN3054.5714292965.357143231.279670324.0
2024-01-153002000002729.03450.03242.02959.03166.03060.7142862953.642857228.87676043.0
2024-01-162944000003002.02729.03450.03057.02901.03044.5714292956.714286233.127903-113.0
2024-01-173402000002944.03002.02729.02993.02795.03103.0000003000.071429266.861387409.0
2024-01-183029000003402.02944.03002.02952.02784.03114.0000003017.571429261.12513077.0
2024-01-192835000003029.03402.02944.03242.03219.03055.8571432990.142857272.919664-407.0
2024-01-203268100002835.03029.03402.03450.02863.03029.8571433019.071429235.171872-182.0
2024-01-212830100003268.02835.03029.02729.02405.03044.2857143049.428571215.949619101.0
2024-01-222900000002830.03268.02835.03002.02959.03029.7142863045.214286222.616797-102.0
2024-01-232963000002900.02830.03268.02944.03057.03032.4285713038.500000221.51061519.0
2024-01-243645000002963.02900.02830.03402.02993.03067.1428573085.071429295.698849243.0
2024-01-253028000003645.02963.02900.03029.02952.03067.0000003090.500000295.720589-1.0
2024-01-263454000003028.03645.02963.02835.03242.03155.4285713105.642857307.113365619.0
2024-01-272896100003454.03028.03645.03268.03450.03102.2857143066.071429316.431517-372.0
2024-01-282533100002896.03454.03028.02830.02729.03059.8571433052.071429373.746540-297.0
2024-01-293031000002533.02896.03454.02900.03002.03078.5714293054.142857367.637891131.0
2024-01-303010000003031.02533.02896.02963.02944.03085.2857143058.857143365.59892247.0
\n", "
" ], "text/plain": [ " jumlah_penumpang_per_hari is_weekend is_holiday_nat \\\n", "tanggal \n", "2024-01-01 3166 0 1 \n", "2024-01-02 2901 0 0 \n", "2024-01-03 2795 0 0 \n", "2024-01-04 2784 0 0 \n", "2024-01-05 3219 0 0 \n", "2024-01-06 2863 1 0 \n", "2024-01-07 2405 1 0 \n", "2024-01-08 2959 0 0 \n", "2024-01-09 3057 0 0 \n", "2024-01-10 2993 0 0 \n", "2024-01-11 2952 0 0 \n", "2024-01-12 3242 0 0 \n", "2024-01-13 3450 1 0 \n", "2024-01-14 2729 1 0 \n", "2024-01-15 3002 0 0 \n", "2024-01-16 2944 0 0 \n", "2024-01-17 3402 0 0 \n", "2024-01-18 3029 0 0 \n", "2024-01-19 2835 0 0 \n", "2024-01-20 3268 1 0 \n", "2024-01-21 2830 1 0 \n", "2024-01-22 2900 0 0 \n", "2024-01-23 2963 0 0 \n", "2024-01-24 3645 0 0 \n", "2024-01-25 3028 0 0 \n", "2024-01-26 3454 0 0 \n", "2024-01-27 2896 1 0 \n", "2024-01-28 2533 1 0 \n", "2024-01-29 3031 0 0 \n", "2024-01-30 3010 0 0 \n", "\n", " flag_contiguous_off flag_almost_contiguous_off events lag_1 \\\n", "tanggal \n", "2024-01-01 0 0 0 NaN \n", "2024-01-02 0 0 0 3166.0 \n", "2024-01-03 0 0 0 2901.0 \n", "2024-01-04 0 0 0 2795.0 \n", "2024-01-05 0 0 0 2784.0 \n", "2024-01-06 0 0 0 3219.0 \n", "2024-01-07 0 0 0 2863.0 \n", "2024-01-08 0 0 0 2405.0 \n", "2024-01-09 0 0 0 2959.0 \n", "2024-01-10 0 0 0 3057.0 \n", "2024-01-11 0 0 0 2993.0 \n", "2024-01-12 0 0 0 2952.0 \n", "2024-01-13 0 0 0 3242.0 \n", "2024-01-14 0 0 0 3450.0 \n", "2024-01-15 0 0 0 2729.0 \n", "2024-01-16 0 0 0 3002.0 \n", "2024-01-17 0 0 0 2944.0 \n", "2024-01-18 0 0 0 3402.0 \n", "2024-01-19 0 0 0 3029.0 \n", "2024-01-20 0 0 0 2835.0 \n", "2024-01-21 0 0 0 3268.0 \n", "2024-01-22 0 0 0 2830.0 \n", "2024-01-23 0 0 0 2900.0 \n", "2024-01-24 0 0 0 2963.0 \n", "2024-01-25 0 0 0 3645.0 \n", "2024-01-26 0 0 0 3028.0 \n", "2024-01-27 0 0 0 3454.0 \n", "2024-01-28 0 0 0 2896.0 \n", "2024-01-29 0 0 0 2533.0 \n", "2024-01-30 0 0 0 3031.0 \n", "\n", " lag_2 lag_3 lag_7 lag_14 roll_mean_7 roll_mean_14 \\\n", "tanggal \n", "2024-01-01 NaN NaN NaN NaN NaN NaN \n", "2024-01-02 NaN NaN NaN NaN NaN NaN \n", "2024-01-03 3166.0 NaN NaN NaN NaN NaN \n", "2024-01-04 2901.0 3166.0 NaN NaN NaN NaN \n", "2024-01-05 2795.0 2901.0 NaN NaN NaN NaN \n", "2024-01-06 2784.0 2795.0 NaN NaN NaN NaN \n", "2024-01-07 3219.0 2784.0 NaN NaN 2876.142857 NaN \n", "2024-01-08 2863.0 3219.0 3166.0 NaN 2846.571429 NaN \n", "2024-01-09 2405.0 2863.0 2901.0 NaN 2868.857143 NaN \n", "2024-01-10 2959.0 2405.0 2795.0 NaN 2897.142857 NaN \n", "2024-01-11 3057.0 2959.0 2784.0 NaN 2921.142857 NaN \n", "2024-01-12 2993.0 3057.0 3219.0 NaN 2924.428571 NaN \n", "2024-01-13 2952.0 2993.0 2863.0 NaN 3008.285714 NaN \n", "2024-01-14 3242.0 2952.0 2405.0 NaN 3054.571429 2965.357143 \n", "2024-01-15 3450.0 3242.0 2959.0 3166.0 3060.714286 2953.642857 \n", "2024-01-16 2729.0 3450.0 3057.0 2901.0 3044.571429 2956.714286 \n", "2024-01-17 3002.0 2729.0 2993.0 2795.0 3103.000000 3000.071429 \n", "2024-01-18 2944.0 3002.0 2952.0 2784.0 3114.000000 3017.571429 \n", "2024-01-19 3402.0 2944.0 3242.0 3219.0 3055.857143 2990.142857 \n", "2024-01-20 3029.0 3402.0 3450.0 2863.0 3029.857143 3019.071429 \n", "2024-01-21 2835.0 3029.0 2729.0 2405.0 3044.285714 3049.428571 \n", "2024-01-22 3268.0 2835.0 3002.0 2959.0 3029.714286 3045.214286 \n", "2024-01-23 2830.0 3268.0 2944.0 3057.0 3032.428571 3038.500000 \n", "2024-01-24 2900.0 2830.0 3402.0 2993.0 3067.142857 3085.071429 \n", "2024-01-25 2963.0 2900.0 3029.0 2952.0 3067.000000 3090.500000 \n", "2024-01-26 3645.0 2963.0 2835.0 3242.0 3155.428571 3105.642857 \n", "2024-01-27 3028.0 3645.0 3268.0 3450.0 3102.285714 3066.071429 \n", "2024-01-28 3454.0 3028.0 2830.0 2729.0 3059.857143 3052.071429 \n", "2024-01-29 2896.0 3454.0 2900.0 3002.0 3078.571429 3054.142857 \n", "2024-01-30 2533.0 2896.0 2963.0 2944.0 3085.285714 3058.857143 \n", "\n", " roll_std_7 diff_seasonal_7 \n", "tanggal \n", "2024-01-01 NaN NaN \n", "2024-01-02 NaN NaN \n", "2024-01-03 NaN NaN \n", "2024-01-04 NaN NaN \n", "2024-01-05 NaN NaN \n", "2024-01-06 NaN NaN \n", "2024-01-07 270.619307 NaN \n", "2024-01-08 243.630771 -207.0 \n", "2024-01-09 256.247555 156.0 \n", "2024-01-10 257.660260 198.0 \n", "2024-01-11 253.149777 168.0 \n", "2024-01-12 257.767245 23.0 \n", "2024-01-13 321.944982 587.0 \n", "2024-01-14 231.279670 324.0 \n", "2024-01-15 228.876760 43.0 \n", "2024-01-16 233.127903 -113.0 \n", "2024-01-17 266.861387 409.0 \n", "2024-01-18 261.125130 77.0 \n", "2024-01-19 272.919664 -407.0 \n", "2024-01-20 235.171872 -182.0 \n", "2024-01-21 215.949619 101.0 \n", "2024-01-22 222.616797 -102.0 \n", "2024-01-23 221.510615 19.0 \n", "2024-01-24 295.698849 243.0 \n", "2024-01-25 295.720589 -1.0 \n", "2024-01-26 307.113365 619.0 \n", "2024-01-27 316.431517 -372.0 \n", "2024-01-28 373.746540 -297.0 \n", "2024-01-29 367.637891 131.0 \n", "2024-01-30 365.598922 47.0 " ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_train.head(30)" ] }, { "cell_type": "code", "execution_count": 9, "id": "f3219a8f", "metadata": {}, "outputs": [], "source": [ "fe_cols = [\n", " 'lag_1', 'lag_2', 'lag_3', 'lag_7', 'lag_14',\n", " 'roll_mean_7', 'roll_mean_14',\n", " 'roll_std_7',\n", " 'diff_seasonal_7'\n", "]\n" ] }, { "cell_type": "code", "execution_count": 10, "id": "9c7aa123", "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", "import pandas as pd\n", "\n", "y_col = 'jumlah_penumpang_per_hari'\n", "\n", "def make_y_features_one_step(y_history: pd.Series) -> dict:\n", " \"\"\"\n", " Given history of y up to time t-1 (last element), \n", " build features for time t (next step).\n", " \"\"\"\n", " out = {}\n", " # --- lags ---\n", " for lag in [1, 2, 3, 7, 14]:\n", " if len(y_history) >= lag:\n", " out[f'lag_{lag}'] = y_history.iloc[-lag]\n", " else:\n", " out[f'lag_{lag}'] = np.nan\n", "\n", " # --- rolling mean 7 & 14 ---\n", " if len(y_history) >= 7:\n", " window7 = y_history.iloc[-7:]\n", " out['roll_mean_7'] = window7.mean()\n", " out['roll_std_7'] = window7.std()\n", " else:\n", " out['roll_mean_7'] = np.nan\n", " out['roll_std_7'] = np.nan\n", "\n", " if len(y_history) >= 14:\n", " window14 = y_history.iloc[-14:]\n", " out['roll_mean_14'] = window14.mean()\n", " else:\n", " out['roll_mean_14'] = np.nan\n", "\n", " # --- seasonal diff 7 (y_t-1 - y_t-8) ---\n", " if len(y_history) >= 8:\n", " out['diff_seasonal_7'] = y_history.iloc[-1] - y_history.iloc[-8]\n", " else:\n", " out['diff_seasonal_7'] = np.nan\n", "\n", " return out\n" ] }, { "cell_type": "code", "execution_count": 20, "id": "e15adf06", "metadata": {}, "outputs": [], "source": [ "# After you engineered features globally:\n", "# df_train has y_col + fe_cols (+ maybe exogenous features)\n", "df_fe = df_train.dropna().copy()\n", "\n", "fe_cols = [\n", " 'lag_1', 'lag_2', 'lag_3', 'lag_7', 'lag_14',\n", " 'roll_mean_7', 'roll_mean_14',\n", " 'roll_std_7',\n", " 'diff_seasonal_7'\n", "]\n", "\n", "# \"Base\" (non-y, non-y-derived) features that are safe in future\n", "base_cols = [c for c in df_fe.columns if c not in fe_cols + [y_col]]\n" ] }, { "cell_type": "code", "execution_count": 21, "id": "bee97eda", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['is_weekend',\n", " 'is_holiday_nat',\n", " 'flag_contiguous_off',\n", " 'flag_almost_contiguous_off',\n", " 'events']" ] }, "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ "base_cols" ] }, { "cell_type": "code", "execution_count": 13, "id": "a0cf524d", "metadata": {}, "outputs": [], "source": [ "from sklearn.base import clone\n", "from sklearn.metrics import mean_absolute_percentage_error\n", "\n", "def expanding_walk_forward_cv(\n", " df_fe: pd.DataFrame,\n", " y_col: str,\n", " fe_cols: list,\n", " base_cols: list,\n", " model,\n", " val_len: int = 7,\n", " init_train_mult: int = 12,\n", "):\n", " \"\"\"\n", " Expanding-window CV with iterative one-step-ahead forecasting on each val fold.\n", " \n", " - Train uses full engineered features (base + fe_cols).\n", " - Validation does NOT use precomputed y-based features.\n", " Instead, it generates them iteratively using history (train y + preds).\n", " \"\"\"\n", " n = len(df_fe)\n", " init_train_len = init_train_mult * val_len # 12 * 7 by default\n", "\n", " if init_train_len + val_len > n:\n", " raise ValueError(\"Not enough data for initial train + one validation window.\")\n", "\n", " all_fold_preds = [] # list of pd.Series\n", " all_fold_true = [] # list of pd.Series\n", " all_fold_idx = [] # indices of validation windows\n", " all_fold_mape = [] # example metric\n", "\n", " fold = 0\n", " start_train = 0\n", " end_train = init_train_len\n", "\n", " while end_train + val_len <= n:\n", " start_val = end_train\n", " end_val = end_train + val_len\n", "\n", " print(f\"Fold {fold}: train [{start_train}:{end_train}), val [{start_val}:{end_val})\")\n", "\n", " train_df = df_fe.iloc[start_train:end_train]\n", " val_df = df_fe.iloc[start_val:end_val]\n", "\n", " # --- training data uses ALL features precomputed (no leakage here) ---\n", " X_train = train_df[base_cols + fe_cols]\n", " y_train = train_df[y_col]\n", "\n", " # fresh model per fold\n", " mdl = clone(model)\n", " mdl.fit(X_train, y_train)\n", "\n", " # --- validation: we only keep base features and true y for scoring ---\n", " X_val_base = val_df[base_cols].reset_index(drop=True)\n", " y_val_true = val_df[y_col].reset_index(drop=True)\n", "\n", " # drop engineered columns conceptually (we won't use them)\n", " # we rebuild them on the fly from y_history\n", "\n", " # history starts as ALL training y (only real values)\n", " y_history = train_df[y_col].copy()\n", "\n", " preds = []\n", "\n", " for t in range(val_len):\n", " # base features for this step, may be empty\n", " base_feats_dict = X_val_base.iloc[t].to_dict()\n", "\n", " # y-based features from history (train + previous preds)\n", " y_feats_dict = make_y_features_one_step(y_history)\n", "\n", " # merge\n", " x_row_dict = {**base_feats_dict, **y_feats_dict}\n", "\n", " # ensure same column order as X_train\n", " x_row = pd.DataFrame([x_row_dict])[X_train.columns]\n", "\n", " # predict one step ahead\n", " y_pred_t = mdl.predict(x_row)[0]\n", " preds.append(y_pred_t)\n", "\n", " # update history with prediction (NOT true y; no leakage)\n", " # preserve index alignment by using val_df index\n", " y_history = pd.concat(\n", " [y_history, pd.Series([y_pred_t], index=[val_df.index[t]])]\n", " )\n", "\n", " preds = pd.Series(preds, index=val_df.index)\n", " mape = mean_absolute_percentage_error(y_val_true.values, preds.values)\n", "\n", " all_fold_preds.append(preds)\n", " all_fold_true.append(y_val_true.set_axis(val_df.index))\n", " all_fold_idx.append((start_val, end_val))\n", " all_fold_mape.append(mape)\n", "\n", " print(f\" Fold {fold} MAPE: {mape:.4f}\")\n", "\n", " # expanding window: extend train to include this validation block\n", " end_train = end_val\n", " fold += 1\n", "\n", " # stop if no more full val blocks\n", " if end_train + val_len > n:\n", " break\n", "\n", " # concat all predictions / truths if you want one long series\n", " preds_concat = pd.concat(all_fold_preds).sort_index()\n", " true_concat = pd.concat(all_fold_true).sort_index()\n", "\n", " overall_mape = mean_absolute_percentage_error(true_concat.values, preds_concat.values)\n", "\n", " results = {\n", " \"fold_indices\": all_fold_idx,\n", " \"fold_mape\": all_fold_mape,\n", " \"overall_mape\": overall_mape,\n", " \"y_true\": true_concat,\n", " \"y_pred\": preds_concat,\n", " }\n", " return results\n" ] }, { "cell_type": "code", "execution_count": 14, "id": "00bd721e", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Fold 0: train [0:84), val [84:91)\n", " Fold 0 MAPE: 0.4259\n", "Fold 1: train [0:91), val [91:98)\n", " Fold 1 MAPE: 0.0824\n", "Fold 2: train [0:98), val [98:105)\n", " Fold 2 MAPE: 0.1116\n", "Fold 3: train [0:105), val [105:112)\n", " Fold 3 MAPE: 0.0873\n", "Fold 4: train [0:112), val [112:119)\n", " Fold 4 MAPE: 0.0572\n", "Fold 5: train [0:119), val [119:126)\n", " Fold 5 MAPE: 0.0734\n", "Fold 6: train [0:126), val [126:133)\n", " Fold 6 MAPE: 0.0656\n", "Fold 7: train [0:133), val [133:140)\n", " Fold 7 MAPE: 0.0701\n", "Fold 8: train [0:140), val [140:147)\n", " Fold 8 MAPE: 0.0765\n", "Fold 9: train [0:147), val [147:154)\n", " Fold 9 MAPE: 0.0855\n", "Fold 10: train [0:154), val [154:161)\n", " Fold 10 MAPE: 0.3711\n", "Fold 11: train [0:161), val [161:168)\n", " Fold 11 MAPE: 0.2244\n", "Fold 12: train [0:168), val [168:175)\n", " Fold 12 MAPE: 0.2562\n", "Fold 13: train [0:175), val [175:182)\n", " Fold 13 MAPE: 0.1525\n", "Fold 14: train [0:182), val [182:189)\n", " Fold 14 MAPE: 0.0768\n", "Fold 15: train [0:189), val [189:196)\n", " Fold 15 MAPE: 0.0842\n", "Fold 16: train [0:196), val [196:203)\n", " Fold 16 MAPE: 0.0814\n", "Fold 17: train [0:203), val [203:210)\n", " Fold 17 MAPE: 0.0608\n", "Fold 18: train [0:210), val [210:217)\n", " Fold 18 MAPE: 0.0635\n", "Fold 19: train [0:217), val [217:224)\n", " Fold 19 MAPE: 0.2391\n", "Fold 20: train [0:224), val [224:231)\n", " Fold 20 MAPE: 0.3383\n", "Fold 21: train [0:231), val [231:238)\n", " Fold 21 MAPE: 0.0774\n", "Fold 22: train [0:238), val [238:245)\n", " Fold 22 MAPE: 0.0727\n", "Fold 23: train [0:245), val [245:252)\n", " Fold 23 MAPE: 0.3235\n", "Fold 24: train [0:252), val [252:259)\n", " Fold 24 MAPE: 0.2921\n", "Fold 25: train [0:259), val [259:266)\n", " Fold 25 MAPE: 0.1111\n", "Fold 26: train [0:266), val [266:273)\n", " Fold 26 MAPE: 0.0992\n", "Fold 27: train [0:273), val [273:280)\n", " Fold 27 MAPE: 0.1085\n", "Fold 28: train [0:280), val [280:287)\n", " Fold 28 MAPE: 0.4846\n", "Fold 29: train [0:287), val [287:294)\n", " Fold 29 MAPE: 0.2386\n", "Fold 30: train [0:294), val [294:301)\n", " Fold 30 MAPE: 0.0620\n", "Fold 31: train [0:301), val [301:308)\n", " Fold 31 MAPE: 0.0829\n", "Fold 32: train [0:308), val [308:315)\n", " Fold 32 MAPE: 0.0655\n", "Fold 33: train [0:315), val [315:322)\n", " Fold 33 MAPE: 0.1338\n", "Fold 34: train [0:322), val [322:329)\n", " Fold 34 MAPE: 0.1095\n", "Fold 35: train [0:329), val [329:336)\n", " Fold 35 MAPE: 0.0932\n", "Fold 36: train [0:336), val [336:343)\n", " Fold 36 MAPE: 0.1876\n", "Fold 37: train [0:343), val [343:350)\n", " Fold 37 MAPE: 0.1189\n", "Fold 38: train [0:350), val [350:357)\n", " Fold 38 MAPE: 0.2199\n", "Fold 39: train [0:357), val [357:364)\n", " Fold 39 MAPE: 0.2282\n", "Fold 40: train [0:364), val [364:371)\n", " Fold 40 MAPE: 0.0756\n", "Fold 41: train [0:371), val [371:378)\n", " Fold 41 MAPE: 0.0804\n", "Fold 42: train [0:378), val [378:385)\n", " Fold 42 MAPE: 0.1063\n", "Fold 43: train [0:385), val [385:392)\n", " Fold 43 MAPE: 0.1402\n", "Fold 44: train [0:392), val [392:399)\n", " Fold 44 MAPE: 0.2496\n", "Fold 45: train [0:399), val [399:406)\n", " Fold 45 MAPE: 0.2012\n", "Fold 46: train [0:406), val [406:413)\n", " Fold 46 MAPE: 0.1642\n", "Fold 47: train [0:413), val [413:420)\n", " Fold 47 MAPE: 0.1073\n", "Fold 48: train [0:420), val [420:427)\n", " Fold 48 MAPE: 0.0700\n", "Fold 49: train [0:427), val [427:434)\n", " Fold 49 MAPE: 0.1021\n", "Fold 50: train [0:434), val [434:441)\n", " Fold 50 MAPE: 0.1329\n", "Fold 51: train [0:441), val [441:448)\n", " Fold 51 MAPE: 0.1902\n", "Fold 52: train [0:448), val [448:455)\n", " Fold 52 MAPE: 0.1325\n", "Fold 53: train [0:455), val [455:462)\n", " Fold 53 MAPE: 0.0906\n", "Fold 54: train [0:462), val [462:469)\n", " Fold 54 MAPE: 0.1682\n", "Fold 55: train [0:469), val [469:476)\n", " Fold 55 MAPE: 0.2515\n", "Fold 56: train [0:476), val [476:483)\n", " Fold 56 MAPE: 0.0723\n", "Fold 57: train [0:483), val [483:490)\n", " Fold 57 MAPE: 0.0581\n", "Fold 58: train [0:490), val [490:497)\n", " Fold 58 MAPE: 0.1544\n", "Fold 59: train [0:497), val [497:504)\n", " Fold 59 MAPE: 0.0752\n", "Fold 60: train [0:504), val [504:511)\n", " Fold 60 MAPE: 0.1152\n", "Fold 61: train [0:511), val [511:518)\n", " Fold 61 MAPE: 0.0827\n", "Fold 62: train [0:518), val [518:525)\n", " Fold 62 MAPE: 0.1680\n", "Fold 63: train [0:525), val [525:532)\n", " Fold 63 MAPE: 0.9767\n", "Fold 64: train [0:532), val [532:539)\n", " Fold 64 MAPE: 0.4159\n", "Overall MAPE: 0.162676140666008\n" ] } ], "source": [ "from xgboost import XGBRegressor\n", "\n", "model = XGBRegressor(\n", " n_estimators=500,\n", " max_depth=4,\n", " learning_rate=0.05,\n", " subsample=0.9,\n", " colsample_bytree=0.9,\n", " objective='reg:squarederror',\n", " random_state=1618\n", ")\n", "\n", "res = expanding_walk_forward_cv(\n", " df_fe=df_fe,\n", " y_col=y_col,\n", " fe_cols=fe_cols,\n", " base_cols=base_cols,\n", " model=model,\n", " val_len=7,\n", " init_train_mult=12\n", ")\n", "\n", "print(\"Overall MAPE:\", res[\"overall_mape\"])\n" ] }, { "cell_type": "code", "execution_count": 15, "id": "412d42b7", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "c:\\Users\\Muhammad Hafiz F\\Documents\\ali\\2025-2026 (Sem 5)\\MPDW\\Projek UAS\\.venv\\Lib\\site-packages\\tqdm\\auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", " from .autonotebook import tqdm as notebook_tqdm\n" ] } ], "source": [ "import optuna\n", "from xgboost import XGBRegressor\n", "\n", "def objective(trial):\n", " # ---- Hyperparameter search space ----\n", " params = {\n", " \"n_estimators\": trial.suggest_int(\"n_estimators\", 200, 1200),\n", " \"max_depth\": trial.suggest_int(\"max_depth\", 2, 10),\n", " \"learning_rate\": trial.suggest_float(\"learning_rate\", 0.005, 0.3, log=True),\n", " \"subsample\": trial.suggest_float(\"subsample\", 0.5, 1.0),\n", " \"colsample_bytree\": trial.suggest_float(\"colsample_bytree\", 0.5, 1.0),\n", " \"min_child_weight\": trial.suggest_float(\"min_child_weight\", 1.0, 20.0),\n", " \"gamma\": trial.suggest_float(\"gamma\", 0.0, 10.0),\n", " \"reg_lambda\": trial.suggest_float(\"reg_lambda\", 1e-3, 10.0, log=True),\n", " \"reg_alpha\": trial.suggest_float(\"reg_alpha\", 1e-3, 10.0, log=True),\n", " \"objective\": \"reg:squarederror\",\n", " \"random_state\": 1618,\n", " \"n_jobs\": -1,\n", " }\n", "\n", " model = XGBRegressor(**params)\n", "\n", " # ---- Expanding-window CV with iterative 1-step forecasting ----\n", " res = expanding_walk_forward_cv(\n", " df_fe=df_fe,\n", " y_col=y_col,\n", " fe_cols=fe_cols,\n", " base_cols=base_cols,\n", " model=model,\n", " val_len=7,\n", " init_train_mult=12,\n", " )\n", "\n", " overall_mape = res[\"overall_mape\"]\n", "\n", " # Optuna minimizes by default if direction=\"minimize\"\n", " return overall_mape\n" ] }, { "cell_type": "code", "execution_count": 16, "id": "67ef1a28", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "[I 2025-11-21 06:40:38,847] A new study created in memory with name: xgb_lrt_walkforward\n", " 0%| | 0/50 [00:00 0:\n", " raise ValueError(\"base_cols is non-empty, but df_future_base is None.\")\n", " # infer time index if possible, otherwise use simple RangeIndex\n", " idx = df_history.index\n", " if isinstance(idx, pd.DatetimeIndex):\n", " freq = pd.infer_freq(idx)\n", " if freq is None:\n", " # fallback: assume daily\n", " freq = \"D\"\n", " start = idx[-1] + to_offset(freq)\n", " future_index = pd.date_range(start=start, periods=h, freq=freq)\n", " else:\n", " start = idx[-1] + 1\n", " future_index = pd.RangeIndex(start=start, stop=start + h)\n", " # empty base feature frame\n", " X_base_future = pd.DataFrame(index=range(h))\n", "\n", " preds = []\n", "\n", " for t in range(h):\n", " # base features for this step\n", " if len(base_cols) > 0:\n", " base_feats_dict = X_base_future.iloc[t].to_dict()\n", " else:\n", " base_feats_dict = {}\n", "\n", " # y-based features from history (train + previous preds)\n", " y_feats_dict = make_y_features_one_step(y_history)\n", "\n", " # merge to one feature row\n", " x_row_dict = {**base_feats_dict, **y_feats_dict}\n", "\n", " # column order must match training\n", " # NOTE: we assume the model was trained on (base_cols + fe_cols) in that order\n", " all_cols = base_cols + fe_cols\n", " x_row = pd.DataFrame([x_row_dict])[all_cols]\n", "\n", " # predict next step\n", " y_pred_t = model.predict(x_row)[0]\n", " preds.append(y_pred_t)\n", "\n", " # update history with prediction (no true y, no leakage)\n", " y_history = pd.concat(\n", " [y_history, pd.Series([y_pred_t], index=[future_index[t]])]\n", " )\n", "\n", " preds = pd.Series(preds, index=future_index, name=f\"{y_col}_forecast\")\n", " return preds\n" ] }, { "cell_type": "code", "execution_count": 25, "id": "8717b801", "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "\n", "event_dates = pd.to_datetime([\n", " \"2025-04-24\", # Promo Hari Kartini\n", " \"2024-06-22\", # Promo HUT Jakarta\n", " \"2025-06-22\", # Promo HUT Jakarta\n", " \"2024-06-23\", # Promo HUT Jakarta (masih)\n", " \"2025-07-01\" # Promo Hari Bhayangkara\n", "]).normalize()\n" ] }, { "cell_type": "code", "execution_count": 26, "id": "0573dddc", "metadata": {}, "outputs": [], "source": [ "import pandas as np\n", "import numpy as np\n", "import holidays\n", "\n", "def build_calendar_exo(idx: pd.DatetimeIndex) -> pd.DataFrame:\n", " if not isinstance(idx, pd.DatetimeIndex):\n", " idx = pd.to_datetime(idx)\n", "\n", " full_idx = pd.date_range(idx.min(), idx.max(), freq=\"D\")\n", " df_full = pd.DataFrame(index=full_idx)\n", "\n", " # weekday + weekend\n", " df_full[\"weekday_num\"] = df_full.index.weekday\n", " df_full[\"is_weekend\"] = df_full[\"weekday_num\"].isin([5, 6]).astype(int)\n", "\n", " # national holidays\n", " years = sorted(set(df_full.index.year))\n", " id_hdays = holidays.country_holidays(\"ID\", years=years)\n", " df_full[\"holiday_name\"] = df_full.index.map(lambda d: id_hdays.get(d, None))\n", " df_full[\"is_holiday_nat\"] = df_full[\"holiday_name\"].notna().astype(int)\n", "\n", " # >>> NEW: events <<<\n", " df_full[\"events\"] = df_full.index.normalize().isin(event_dates).astype(int)\n", "\n", " # unified OFF\n", " df_full[\"is_off\"] = ((df_full[\"is_weekend\"] == 1) | (df_full[\"is_holiday_nat\"] == 1)).astype(int)\n", "\n", " # contiguous blocks\n", " is_off = df_full[\"is_off\"].values\n", " block_id = []\n", " curr_id, prev_off = -1, False\n", " for flag in is_off:\n", " if flag == 1 and not prev_off:\n", " curr_id += 1\n", " block_id.append(curr_id if flag == 1 else -1)\n", " prev_off = (flag == 1)\n", " df_full[\"off_block_id\"] = block_id\n", "\n", " off_len = (\n", " df_full[df_full[\"off_block_id\"] >= 0]\n", " .groupby(\"off_block_id\")\n", " .size()\n", " .rename(\"off_block_len\")\n", " )\n", " df_full = df_full.merge(\n", " off_len, left_on=\"off_block_id\", right_index=True, how=\"left\"\n", " )\n", " df_full[\"off_block_len\"] = df_full[\"off_block_len\"].fillna(0).astype(int)\n", "\n", " df_full[\"flag_contiguous_off\"] = (\n", " (df_full[\"is_off\"] == 1) & (df_full[\"off_block_len\"] >= 3)\n", " ).astype(int)\n", "\n", " # almost-contiguous (1-0-1)\n", " is_off = df_full[\"is_off\"].values\n", " prev_off = np.r_[False, is_off[:-1] == 1]\n", " next_off = np.r_[is_off[1:] == 1, False]\n", " bridge_mask = (is_off == 0) & prev_off & next_off\n", "\n", " almost_mask = np.zeros(len(df_full), dtype=bool)\n", " if bridge_mask.any():\n", " bridge_idx = np.where(bridge_mask)[0]\n", " for b in bridge_idx:\n", " L = b - 1\n", " while L >= 0 and is_off[L] == 1:\n", " almost_mask[L] = True\n", " L -= 1\n", " R = b + 1\n", " while R < len(df_full) and is_off[R] == 1:\n", " almost_mask[R] = True\n", " R += 1\n", "\n", " df_full[\"flag_almost_contiguous_off\"] = (\n", " almost_mask & (df_full[\"is_off\"] == 1)\n", " ).astype(int)\n", "\n", " # return only what you actually use in base_cols\n", " df_out = df_full.loc[idx, [\n", " \"is_weekend\",\n", " \"is_holiday_nat\",\n", " \"flag_contiguous_off\",\n", " \"flag_almost_contiguous_off\",\n", " \"events\", # <<< make sure this is here\n", " ]]\n", "\n", " return df_out\n" ] }, { "cell_type": "code", "execution_count": 27, "id": "4c6a95eb", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
is_weekendis_holiday_natflag_contiguous_offflag_almost_contiguous_offevents
2025-07-0700000
2025-07-0800000
2025-07-0900000
2025-07-1000000
2025-07-1100000
2025-07-1210000
2025-07-1310000
2025-07-1400000
2025-07-1500000
2025-07-1600000
2025-07-1700000
2025-07-1800000
2025-07-1910000
2025-07-2010000
2025-07-2100000
2025-07-2200000
2025-07-2300000
2025-07-2400000
2025-07-2500000
2025-07-2610000
2025-07-2710000
2025-07-2800000
2025-07-2900000
2025-07-3000000
2025-07-3100000
2025-08-0100000
2025-08-0210000
2025-08-0310000
2025-08-0400000
2025-08-0500000
2025-08-0600000
2025-08-0700000
2025-08-0800000
2025-08-0910000
2025-08-1010000
2025-08-1100000
2025-08-1200000
2025-08-1300000
2025-08-1400000
2025-08-1500000
2025-08-1610000
2025-08-1711000
2025-08-1800000
2025-08-1900000
2025-08-2000000
2025-08-2100000
2025-08-2200000
2025-08-2310000
2025-08-2410000
2025-08-2500000
2025-08-2600000
2025-08-2700000
2025-08-2800000
2025-08-2900000
2025-08-3010000
2025-08-3110000
\n", "
" ], "text/plain": [ " is_weekend is_holiday_nat flag_contiguous_off \\\n", "2025-07-07 0 0 0 \n", "2025-07-08 0 0 0 \n", "2025-07-09 0 0 0 \n", "2025-07-10 0 0 0 \n", "2025-07-11 0 0 0 \n", "2025-07-12 1 0 0 \n", "2025-07-13 1 0 0 \n", "2025-07-14 0 0 0 \n", "2025-07-15 0 0 0 \n", "2025-07-16 0 0 0 \n", "2025-07-17 0 0 0 \n", "2025-07-18 0 0 0 \n", "2025-07-19 1 0 0 \n", "2025-07-20 1 0 0 \n", "2025-07-21 0 0 0 \n", "2025-07-22 0 0 0 \n", "2025-07-23 0 0 0 \n", "2025-07-24 0 0 0 \n", "2025-07-25 0 0 0 \n", "2025-07-26 1 0 0 \n", "2025-07-27 1 0 0 \n", "2025-07-28 0 0 0 \n", "2025-07-29 0 0 0 \n", "2025-07-30 0 0 0 \n", "2025-07-31 0 0 0 \n", "2025-08-01 0 0 0 \n", "2025-08-02 1 0 0 \n", "2025-08-03 1 0 0 \n", "2025-08-04 0 0 0 \n", "2025-08-05 0 0 0 \n", "2025-08-06 0 0 0 \n", "2025-08-07 0 0 0 \n", "2025-08-08 0 0 0 \n", "2025-08-09 1 0 0 \n", "2025-08-10 1 0 0 \n", "2025-08-11 0 0 0 \n", "2025-08-12 0 0 0 \n", "2025-08-13 0 0 0 \n", "2025-08-14 0 0 0 \n", "2025-08-15 0 0 0 \n", "2025-08-16 1 0 0 \n", "2025-08-17 1 1 0 \n", "2025-08-18 0 0 0 \n", "2025-08-19 0 0 0 \n", "2025-08-20 0 0 0 \n", "2025-08-21 0 0 0 \n", "2025-08-22 0 0 0 \n", "2025-08-23 1 0 0 \n", "2025-08-24 1 0 0 \n", "2025-08-25 0 0 0 \n", "2025-08-26 0 0 0 \n", "2025-08-27 0 0 0 \n", "2025-08-28 0 0 0 \n", "2025-08-29 0 0 0 \n", "2025-08-30 1 0 0 \n", "2025-08-31 1 0 0 \n", "\n", " flag_almost_contiguous_off events \n", "2025-07-07 0 0 \n", "2025-07-08 0 0 \n", "2025-07-09 0 0 \n", "2025-07-10 0 0 \n", "2025-07-11 0 0 \n", "2025-07-12 0 0 \n", "2025-07-13 0 0 \n", "2025-07-14 0 0 \n", "2025-07-15 0 0 \n", "2025-07-16 0 0 \n", "2025-07-17 0 0 \n", "2025-07-18 0 0 \n", "2025-07-19 0 0 \n", "2025-07-20 0 0 \n", "2025-07-21 0 0 \n", "2025-07-22 0 0 \n", "2025-07-23 0 0 \n", "2025-07-24 0 0 \n", "2025-07-25 0 0 \n", "2025-07-26 0 0 \n", "2025-07-27 0 0 \n", "2025-07-28 0 0 \n", "2025-07-29 0 0 \n", "2025-07-30 0 0 \n", "2025-07-31 0 0 \n", "2025-08-01 0 0 \n", "2025-08-02 0 0 \n", "2025-08-03 0 0 \n", "2025-08-04 0 0 \n", "2025-08-05 0 0 \n", "2025-08-06 0 0 \n", "2025-08-07 0 0 \n", "2025-08-08 0 0 \n", "2025-08-09 0 0 \n", "2025-08-10 0 0 \n", "2025-08-11 0 0 \n", "2025-08-12 0 0 \n", "2025-08-13 0 0 \n", "2025-08-14 0 0 \n", "2025-08-15 0 0 \n", "2025-08-16 0 0 \n", "2025-08-17 0 0 \n", "2025-08-18 0 0 \n", "2025-08-19 0 0 \n", "2025-08-20 0 0 \n", "2025-08-21 0 0 \n", "2025-08-22 0 0 \n", "2025-08-23 0 0 \n", "2025-08-24 0 0 \n", "2025-08-25 0 0 \n", "2025-08-26 0 0 \n", "2025-08-27 0 0 \n", "2025-08-28 0 0 \n", "2025-08-29 0 0 \n", "2025-08-30 0 0 \n", "2025-08-31 0 0 " ] }, "execution_count": 27, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from pandas.tseries.frequencies import to_offset\n", "\n", "h = 56\n", "\n", "idx_hist = df_fe.index # DatetimeIndex of your training history\n", "freq = pd.infer_freq(idx_hist)\n", "if freq is None:\n", " freq = \"D\"\n", "\n", "# build future index\n", "start_future = idx_hist[-1] + to_offset(freq)\n", "future_idx = pd.date_range(start=start_future, periods=h, freq=freq)\n", "\n", "# IMPORTANT:\n", "# contiguity of off-days should be computed on history + future together\n", "full_idx = pd.date_range(idx_hist.min(), future_idx.max(), freq=freq)\n", "\n", "# build flags on the full range\n", "flags_full = build_calendar_exo(full_idx)\n", "\n", "# slice only the future part\n", "df_future_base_56 = flags_full.loc[future_idx, :]\n", "\n", "# keep only the base_cols used by the model, in the same order\n", "df_future_base_56 = df_future_base_56[base_cols]\n", "\n", "df_future_base_56" ] }, { "cell_type": "code", "execution_count": 31, "id": "9e47552b", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "2025-07-07 3364.089844\n", "2025-07-08 3595.881836\n", "2025-07-09 3107.742188\n", "2025-07-10 3074.942383\n", "2025-07-11 3143.840332\n", "Freq: D, Name: jumlah_penumpang_per_hari_forecast, dtype: float32" ] }, "execution_count": 31, "metadata": {}, "output_type": "execute_result" } ], "source": [ "h = 56\n", "\n", "future_56 = forecast_h_steps(\n", " model=final_model,\n", " df_history=df_fe, # your dropna()’d engineered df\n", " y_col=y_col,\n", " fe_cols=fe_cols,\n", " base_cols=base_cols,\n", " h=h,\n", " df_future_base=df_future_base_56\n", ")\n", "\n", "future_56.head()" ] }, { "cell_type": "code", "execution_count": 33, "id": "2e08e038", "metadata": {}, "outputs": [], "source": [ "# make future_56 a dataframe for easier handling\n", "future_56 = future_56.to_frame()\n", "future_56.to_csv('lrt_forecast_xgboost_56_days.csv')" ] }, { "cell_type": "code", "execution_count": 30, "id": "93688bcf", "metadata": {}, "outputs": [], "source": [ "y_test = df_test[y_col]" ] }, { "cell_type": "code", "execution_count": 32, "id": "e6a632f1", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "MAPE 56-step forecast: 0.12389647960662842 ( 12.389647960662842 % )\n" ] } ], "source": [ "from sklearn.metrics import mean_absolute_percentage_error\n", "import numpy as np\n", "\n", "# avoid division by zero: only keep non-zero y\n", "mask = y_test != 0\n", "\n", "mape_56 = mean_absolute_percentage_error(\n", " y_test[mask].values,\n", " future_56[mask].values\n", ")\n", "\n", "print(\"MAPE 56-step forecast:\", mape_56, \"(\", mape_56 * 100, \"% )\")\n" ] }, { "cell_type": "code", "execution_count": null, "id": "c6028d84", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": ".venv", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.0" } }, "nbformat": 4, "nbformat_minor": 5 }