利用Python进行数据分析_Pandas_数据规整

发布时间 2023-12-20 14:47:48作者: 空空空空空空空空空空

数据规整

1.时间序列以及截面对齐

import pandas as pd
import numpy as np
from pandas import Series,DataFrame
import warnings
warnings.filterwarnings("ignore")
# Build a daily date range (10 calendar days)
date_range = pd.date_range(start="2023-01-01", end="2023-01-10", freq="D")

# Generate random prices for 4 stocks
stock_symbols = ['AAPL', 'GOOGL', 'MSFT', 'AMZN']
# np.random.rand draws uniformly distributed values in [0.0, 1.0) with the given shape.
prices_data = np.random.rand(len(date_range), len(stock_symbols)) * 100  # random prices in [0, 100)
prices = pd.DataFrame(prices_data, index=date_range, columns=stock_symbols)

# Generate random volumes for the same 4 stocks, but only on a subset of the dates in `prices`
subset_dates = date_range[::2]  # every other day (step of 2), i.e. half of the dates in `prices`
volumes_data = np.random.randint(100000, 500000, size=(len(subset_dates), len(stock_symbols)))  # random volumes
volumes = pd.DataFrame(volumes_data, index=subset_dates, columns=stock_symbols)

# 打印生成的DataFrame
prices
AAPL GOOGL MSFT AMZN
2023-01-01 14.314858 3.770164 47.853384 52.129960
2023-01-02 49.168337 16.809880 50.745822 4.065592
2023-01-03 51.006419 87.196374 56.078768 9.049886
2023-01-04 21.995947 8.197457 41.555084 57.651605
2023-01-05 46.431166 83.819638 78.740614 80.649507
2023-01-06 55.849528 89.490260 22.954482 62.232844
2023-01-07 93.226985 56.326575 26.826220 55.494495
2023-01-08 78.876867 88.315382 58.793917 13.255849
2023-01-09 69.001249 48.880222 49.288958 56.896331
2023-01-10 8.710692 96.346660 40.227193 18.281541
volumes
AAPL GOOGL MSFT AMZN
2023-01-01 300532 364114 475873 490740
2023-01-03 464820 447201 323960 427487
2023-01-05 470647 452899 205389 168949
2023-01-07 113168 482721 184416 387871
2023-01-09 271816 232564 285368 121470
prices*volumes
AAPL GOOGL MSFT AMZN
2023-01-01 4.302073e+06 1.372770e+06 2.277213e+07 2.558226e+07
2023-01-02 NaN NaN NaN NaN
2023-01-03 2.370880e+07 3.899431e+07 1.816728e+07 3.868709e+06
2023-01-04 NaN NaN NaN NaN
2023-01-05 2.185269e+07 3.796183e+07 1.617246e+07 1.362565e+07
2023-01-06 NaN NaN NaN NaN
2023-01-07 1.055031e+07 2.719002e+07 4.947184e+06 2.152471e+07
2023-01-08 NaN NaN NaN NaN
2023-01-09 1.875564e+07 1.136778e+07 1.406549e+07 6.911197e+06
2023-01-10 NaN NaN NaN NaN

通过一组索引不同的Series构建一个DataFrame

import pandas as pd
import numpy as np

# Build some sample data
data1 = np.random.randn(5)  # 5 draws from the standard normal distribution
data2 = np.random.randint(1, 10, 6)  # 6 random integers in [1, 10)
data3 = np.random.random(4)  # 4 random floats in [0.0, 1.0)
data4 = ["apple", "banana", "orange", "grape"]  # some string data

# Create four Series, each with a different kind of index
index1 = [10, 20, 30, 40, 50]
index2 = ['A', 'B', 'C', 'D', 'E', 'F']
index3 = pd.date_range('2023-01-01', periods=4, freq='D')
index4 = ['one', 'two', 'three', 'four']

series1 = pd.Series(data1, index=index1)
series2 = pd.Series(data2, index=index2)
series3 = pd.Series(data3, index=index3)
series4 = pd.Series(data4, index=index4)

# The resulting frame is indexed by the union of all four indexes, with NaN wherever a
# Series has no entry for a label. The labels mix ints, strings and Timestamps, which
# cannot be ordered against each other; that is what the RuntimeWarning on this line reports.
DataFrame({'one':series1,'two':series2, 'three':series3, 'four':series4})
C:\Users\hspcadmin\AppData\Local\Temp\ipykernel_13588\3484969428.py:21: RuntimeWarning: '<' not supported between instances of 'Timestamp' and 'int', sort order is undefined for incomparable objects.
  DataFrame({'one':series1,'two':series2, 'three':series3, 'four':series4})
one two three four
10 -0.967830 NaN NaN NaN
20 -2.051181 NaN NaN NaN
30 0.816328 NaN NaN NaN
40 1.028584 NaN NaN NaN
50 -0.017745 NaN NaN NaN
A NaN 7.0 NaN NaN
B NaN 2.0 NaN NaN
C NaN 6.0 NaN NaN
D NaN 1.0 NaN NaN
E NaN 5.0 NaN NaN
F NaN 3.0 NaN NaN
2023-01-01 00:00:00 NaN NaN 0.851957 NaN
2023-01-02 00:00:00 NaN NaN 0.241342 NaN
2023-01-03 00:00:00 NaN NaN 0.514155 NaN
2023-01-04 00:00:00 NaN NaN 0.093532 NaN
one NaN NaN NaN apple
two NaN NaN NaN banana
three NaN NaN NaN orange
four NaN NaN NaN grape

2.频率不同的时间序列的计算

t = Series(np.random.randn(3),index=pd.date_range('2023-01-01',periods=3,freq='W-WED'))
t
2023-01-04    0.143276
2023-01-11   -0.917840
2023-01-18   -1.320858
Freq: W-WED, dtype: float64
tt_resample = t.resample('B')
tt_resample
<pandas.core.resample.DatetimeIndexResampler object at 0x000001775C717C50>
tt_resample_ffill = t.resample('B').ffill()
tt_resample_ffill
2023-01-04    0.143276
2023-01-05    0.143276
2023-01-06    0.143276
2023-01-09    0.143276
2023-01-10    0.143276
2023-01-11   -0.917840
2023-01-12   -0.917840
2023-01-13   -0.917840
2023-01-16   -0.917840
2023-01-17   -0.917840
2023-01-18   -1.320858
Freq: B, dtype: float64
t.reindex(tt_resample_ffill.index).ffill()
2023-01-04    0.143276
2023-01-05    0.143276
2023-01-06    0.143276
2023-01-09    0.143276
2023-01-10    0.143276
2023-01-11   -0.917840
2023-01-12   -0.917840
2023-01-13   -0.917840
2023-01-16   -0.917840
2023-01-17   -0.917840
2023-01-18   -1.320858
Freq: B, dtype: float64
# Quarter-end dates with the quarterly frequency anchored to a year ending in March
q_mar = pd.date_range('2023-01-01', periods=4, freq='Q-MAR')
# Quarter-end dates with the quarterly frequency anchored to a year ending in September.
# NOTE(review): both anchors put quarter ends in Mar/Jun/Sep/Dec, so the two indexes
# hold identical dates; the anchor presumably only matters for fiscal-quarter labelling
# (e.g. with Periods) -- confirm against the pandas offset-alias docs.
q_sep = pd.date_range('2023-01-01', periods=4, freq='Q-SEP')
q_sep
DatetimeIndex(['2023-03-31', '2023-06-30', '2023-09-30', '2023-12-31'], dtype='datetime64[ns]', freq='Q-SEP')
q_mar
DatetimeIndex(['2023-03-31', '2023-06-30', '2023-09-30', '2023-12-31'], dtype='datetime64[ns]', freq='Q-MAR')

3.时间和“最当前”数据的选取

import akshare as ak
stock_zh_a_tick_tx_js_df = ak.stock_zh_a_tick_tx_js(symbol="sh600570")
stock_zh_a_tick_tx_js_df
D:\Program Files\Python\Python311\Lib\site-packages\akshare\stock\stock_zh_a_tick_tx.py:27: UserWarning: 正在下载数据,请稍等
  warnings.warn("正在下载数据,请稍等")
成交时间 成交价格 价格变动 成交量 成交金额 性质
0 09:25:02 28.55 0.00 485 1384675 卖盘
1 09:30:02 28.55 0.00 31 88492 买盘
2 09:30:05 28.56 0.01 172 491086 买盘
3 09:30:08 28.59 0.03 55 157070 买盘
4 09:30:11 28.55 -0.04 142 405509 卖盘
... ... ... ... ... ... ...
688 10:04:50 28.07 0.00 33 92641 卖盘
689 10:04:53 28.08 0.01 5 14038 买盘
690 10:04:56 28.08 0.00 6 16847 买盘
691 10:04:59 28.07 -0.01 77 216164 卖盘
692 10:05:02 28.07 0.00 129 362237 卖盘

693 rows × 6 columns

from datetime import time
# NOTE(review): this line only references the bound method (there are no call
# parentheses), so the cell merely echoes the method's repr and converts nothing.
# The index is turned into datetimes by the pd.to_datetime assignment instead;
# to_timestamp is presumably intended for PeriodIndex data -- confirm before calling it.
stock_zh_a_tick_tx_js_df.to_timestamp
<bound method DataFrame.to_timestamp of                          成交时间   成交价格  价格变动  成交量     成交金额  性质
成交时间                                                        
1900-01-01 09:25:02  09:25:02  28.55  0.00  485  1384675  卖盘
1900-01-01 09:30:02  09:30:02  28.55  0.00   31    88492  买盘
1900-01-01 09:30:05  09:30:05  28.56  0.01  172   491086  买盘
1900-01-01 09:30:08  09:30:08  28.59  0.03   55   157070  买盘
1900-01-01 09:30:11  09:30:11  28.55 -0.04  142   405509  卖盘
...                       ...    ...   ...  ...      ...  ..
1900-01-01 10:04:50  10:04:50  28.07  0.00   33    92641  卖盘
1900-01-01 10:04:53  10:04:53  28.08  0.01    5    14038  买盘
1900-01-01 10:04:56  10:04:56  28.08  0.00    6    16847  买盘
1900-01-01 10:04:59  10:04:59  28.07 -0.01   77   216164  卖盘
1900-01-01 10:05:02  10:05:02  28.07  0.00  129   362237  卖盘

[693 rows x 6 columns]>
# 将时间戳列设置为 DatetimeIndex
stock_zh_a_tick_tx_js_df.index = pd.to_datetime(stock_zh_a_tick_tx_js_df['成交时间'], format='%H:%M:%S')
stock_zh_a_tick_tx_js_df
成交时间 成交价格 价格变动 成交量 成交金额 性质
成交时间
1900-01-01 09:25:02 09:25:02 28.55 0.00 485 1384675 卖盘
1900-01-01 09:30:02 09:30:02 28.55 0.00 31 88492 买盘
1900-01-01 09:30:05 09:30:05 28.56 0.01 172 491086 买盘
1900-01-01 09:30:08 09:30:08 28.59 0.03 55 157070 买盘
1900-01-01 09:30:11 09:30:11 28.55 -0.04 142 405509 卖盘
... ... ... ... ... ... ...
1900-01-01 10:04:50 10:04:50 28.07 0.00 33 92641 卖盘
1900-01-01 10:04:53 10:04:53 28.08 0.01 5 14038 买盘
1900-01-01 10:04:56 10:04:56 28.08 0.00 6 16847 买盘
1900-01-01 10:04:59 10:04:59 28.07 -0.01 77 216164 卖盘
1900-01-01 10:05:02 10:05:02 28.07 0.00 129 362237 卖盘

693 rows × 6 columns

selected_rows = stock_zh_a_tick_tx_js_df.between_time(time(9, 50), time(10, 1))
selected_rows
成交时间 成交价格 价格变动 成交量 成交金额 性质
成交时间
1900-01-01 09:50:02 09:50:02 27.91 -0.02 304 848440 卖盘
1900-01-01 09:50:05 09:50:05 27.90 -0.01 879 2451488 卖盘
1900-01-01 09:50:08 09:50:08 27.89 -0.01 149 415565 买盘
1900-01-01 09:50:11 09:50:11 27.88 -0.01 114 317867 卖盘
1900-01-01 09:50:14 09:50:14 27.89 0.01 47 131033 买盘
... ... ... ... ... ... ...
1900-01-01 10:00:47 10:00:47 28.05 0.01 15 42061 买盘
1900-01-01 10:00:50 10:00:50 28.04 -0.01 37 103770 卖盘
1900-01-01 10:00:53 10:00:53 28.06 0.02 39 109408 买盘
1900-01-01 10:00:56 10:00:56 28.11 0.05 34 95503 买盘
1900-01-01 10:00:59 10:00:59 28.10 -0.01 115 323101 卖盘

216 rows × 6 columns

# at_time keeps only rows whose time of day matches 09:50:00. The result is empty here
# because the ticks around that moment are stamped 09:50:02, 09:50:05, ... and none falls
# exactly on 09:50:00; use between_time for a time window instead.
selected_rows_at_time = stock_zh_a_tick_tx_js_df.at_time(time(9, 50))
selected_rows_at_time
成交时间 成交价格 价格变动 成交量 成交金额 性质
成交时间

4.拼接多个数据源

import akshare as ak
import pandas as pd

symbols = ['000001', '600570']

# Fetch the forward-adjusted ("qfq") daily history of each symbol, keyed by symbol.
historical_data = {
    symbol: ak.stock_zh_a_hist(
        symbol=symbol,
        period="daily",
        start_date="20230101",
        end_date='20231219',
        adjust="qfq"
    )
    for symbol in symbols
}

# Place the per-symbol frames side by side; the symbol becomes the outer column level.
# NOTE(review): the frames are aligned on their row-number index, not on the 日期 column,
# so rows only line up if every symbol traded on exactly the same days -- confirm, or
# index each frame by 日期 before concatenating.
df_list = list(historical_data.values())
result_df = pd.concat(df_list, keys=historical_data.keys(), axis=1)
result_df
000001 ... 600570
日期 开盘 收盘 最高 最低 成交量 成交额 振幅 涨跌幅 涨跌额 ... 开盘 收盘 最高 最低 成交量 成交额 振幅 涨跌幅 涨跌额 换手率
0 2023-01-03 12.92 13.49 13.57 12.77 2194128 2.971547e+09 6.21 4.74 0.61 ... 40.22 41.77 42.04 39.98 188785 7.876238e+08 5.11 3.57 1.44 0.99
1 2023-01-04 13.43 14.04 14.14 13.35 2189683 3.110729e+09 5.86 4.08 0.55 ... 41.94 42.63 42.92 41.24 257842 1.088131e+09 4.02 2.06 0.86 1.36
2 2023-01-05 14.12 14.20 14.46 14.09 1665425 2.417272e+09 2.64 1.14 0.16 ... 42.19 41.96 42.61 41.57 99206 4.184562e+08 2.44 -1.57 -0.67 0.52
3 2023-01-06 14.22 14.34 14.44 14.20 1195745 1.747915e+09 1.69 0.99 0.14 ... 42.30 42.08 43.87 41.57 218465 9.297619e+08 5.48 0.29 0.12 1.15
4 2023-01-09 14.47 14.52 14.60 14.24 1057659 1.561368e+09 2.51 1.26 0.18 ... 41.45 42.40 42.98 40.77 191544 8.106186e+08 5.25 0.76 0.32 1.01
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
229 2023-12-13 9.38 9.16 9.39 9.15 1061302 9.810012e+08 2.55 -2.76 -0.26 ... 29.78 29.34 30.16 29.33 235863 7.014522e+08 2.79 -1.28 -0.38 1.24
230 2023-12-14 9.21 9.15 9.28 9.15 742901 6.832115e+08 1.42 -0.11 -0.01 ... 29.60 29.17 29.87 29.15 155482 4.586276e+08 2.45 -0.58 -0.17 0.82
231 2023-12-15 9.20 9.21 9.35 9.19 988939 9.151261e+08 1.75 0.66 0.06 ... 29.32 28.77 29.36 28.71 185556 5.368904e+08 2.23 -1.37 -0.40 0.98
232 2023-12-18 9.18 9.13 9.24 9.09 654426 5.993790e+08 1.63 -0.87 -0.08 ... 28.55 28.49 29.18 28.30 160454 4.597516e+08 3.06 -0.97 -0.28 0.84
233 2023-12-19 9.12 9.10 9.17 9.07 644071 5.867167e+08 1.10 -0.33 -0.03 ... 28.48 28.66 28.81 28.27 150101 4.282943e+08 1.90 0.60 0.17 0.79

234 rows × 22 columns

5.收益指数和累计收益

import akshare as ak
stock_zh_a_hist_df = ak.stock_zh_a_hist(symbol="600570", period="daily", start_date="20230101", end_date='20231219', adjust="qfq")
stock_zh_a_hist_df
日期 开盘 收盘 最高 最低 成交量 成交额 振幅 涨跌幅 涨跌额 换手率
0 2023-01-03 40.22 41.77 42.04 39.98 188785 7.876238e+08 5.11 3.57 1.44 0.99
1 2023-01-04 41.94 42.63 42.92 41.24 257842 1.088131e+09 4.02 2.06 0.86 1.36
2 2023-01-05 42.19 41.96 42.61 41.57 99206 4.184562e+08 2.44 -1.57 -0.67 0.52
3 2023-01-06 42.30 42.08 43.87 41.57 218465 9.297619e+08 5.48 0.29 0.12 1.15
4 2023-01-09 41.45 42.40 42.98 40.77 191544 8.106186e+08 5.25 0.76 0.32 1.01
... ... ... ... ... ... ... ... ... ... ... ...
229 2023-12-13 29.78 29.34 30.16 29.33 235863 7.014522e+08 2.79 -1.28 -0.38 1.24
230 2023-12-14 29.60 29.17 29.87 29.15 155482 4.586276e+08 2.45 -0.58 -0.17 0.82
231 2023-12-15 29.32 28.77 29.36 28.71 185556 5.368904e+08 2.23 -1.37 -0.40 0.98
232 2023-12-18 28.55 28.49 29.18 28.30 160454 4.597516e+08 3.06 -0.97 -0.28 0.84
233 2023-12-19 28.48 28.66 28.81 28.27 150101 4.282943e+08 1.90 0.60 0.17 0.79

234 rows × 11 columns

# 将日期列转换为 datetime 类型的索引
stock_zh_a_hist_df.index = pd.to_datetime(stock_zh_a_hist_df['日期'])

# 选择特定日期的数据,比如 "2023-01-05"
selected_date_1 = stock_zh_a_hist_df.loc['2023-01-05']
selected_date_2 = stock_zh_a_hist_df.loc['2023-12-19']
closed = stock_zh_a_hist_df['收盘']
closed
日期
2023-01-03    41.77
2023-01-04    42.63
2023-01-05    41.96
2023-01-06    42.08
2023-01-09    42.40
              ...  
2023-12-13    29.34
2023-12-14    29.17
2023-12-15    28.77
2023-12-18    28.49
2023-12-19    28.66
Name: 收盘, Length: 234, dtype: float64
# 获取特定日期的收盘价
closing_price_1 = selected_date_1['收盘']
closing_price_2 = selected_date_2['收盘']
closing_price_1
41.96
closing_price_2
28.66
change_rate = (closing_price_2/closing_price_1-1)*100
change_rate
-31.696854146806487

使用 pct_change() 方法计算相邻两个元素之间的变化率(结果为小数形式,乘以 100 才是百分比):

# Day-over-day fractional change of the close price; the first element has no
# predecessor and is therefore NaN.
returns = closed.pct_change()
# Compound the daily returns into a return index (growth of one unit invested).
ret_index = (1+returns).cumprod()
# Seed the index at 1 on the first day. `closed` is indexed by dates, so set the value
# by position with .iloc: a bare integer key on a non-integer index relies on the
# deprecated positional fallback of Series.__setitem__.
ret_index.iloc[0] = 1
ret_index
日期
2023-01-03    1.000000
2023-01-04    1.020589
2023-01-05    1.004549
2023-01-06    1.007422
2023-01-09    1.015083
                ...   
2023-12-13    0.702418
2023-12-14    0.698348
2023-12-15    0.688772
2023-12-18    0.682068
2023-12-19    0.686138
Name: 收盘, Length: 234, dtype: float64
# 计算指定日期内的累计收益
m_returns = ret_index.resample('BM').last()
m_returns
日期
2023-01-31    1.127364
2023-02-28    1.059852
2023-03-31    1.271008
2023-04-28    1.179555
2023-05-31    1.013886
2023-06-30    1.060330
2023-07-31    0.984439
2023-08-31    0.861863
2023-09-29    0.776873
2023-10-31    0.745032
2023-11-30    0.718937
2023-12-29    0.686138
Freq: BM, Name: 收盘, dtype: float64
m_returns.pct_change()
日期
2023-01-31         NaN
2023-02-28   -0.059885
2023-03-31    0.199232
2023-04-28   -0.071953
2023-05-31   -0.140451
2023-06-30    0.045809
2023-07-31   -0.071574
2023-08-31   -0.124514
2023-09-29   -0.098611
2023-10-31   -0.040986
2023-11-30   -0.035026
2023-12-29   -0.045621
Freq: BM, Name: 收盘, dtype: float64
m_returns['2023-08-31']
0.8618625807996175

6.分组变换和分析

7.分组因子暴露

在进行因子分析之前,需要明确你想要分析的因子是什么。因子可以是与股票价格变动相关的任何特征,比如财务指标、技术指标、市值等。通常,因子分析的目标是找到与股票收益变化相关的因子。

以下是一个简单的例子,假设你想分析的因子是每个股票的收益率(以百分比形式)。你可以使用pct_change方法计算每个股票的收益率,并将其作为因子进行分析:

import akshare as ak
import pandas as pd
import statsmodels.api as sm

# Stock codes to analyse
stock_symbols = ['600570', '600313']

# Fetch the forward-adjusted daily history of each stock and tag its rows with the code.
stock_frames = []
for symbol in stock_symbols:
    stock_data = ak.stock_zh_a_hist(symbol=symbol, period="daily", start_date="20230101", end_date='20231219', adjust="qfq")
    stock_data['证券代码'] = symbol
    stock_frames.append(stock_data)

# Concatenate once (instead of growing a frame inside the loop) and renumber the rows so
# the stacked frame has unique index labels.
combined_df = pd.concat(stock_frames, ignore_index=True)

# Daily return in percent, computed per stock. Without the groupby, the first row of the
# second stock would get a "return" measured against the last close of the first stock,
# a large spurious outlier that distorts the regression.
combined_df['收益率'] = combined_df.groupby('证券代码')['收盘'].pct_change() * 100

# Drop rows with missing values (the first trading day of each stock has no return)
combined_df = combined_df.dropna()

# Regress 涨跌幅 on the return factor, with an intercept.
# NOTE(review): 涨跌幅 looks like the daily percentage change already supplied by the data
# source, i.e. nearly the same quantity as 收益率 -- confirm this is the intended factor.
X = sm.add_constant(combined_df['收益率'])  # add the constant (intercept) term
y = combined_df['涨跌幅']

model = sm.OLS(y, X).fit()
# Show the regression summary
model.summary()
OLS Regression Results
Dep. Variable: 涨跌幅 R-squared: 0.326
Model: OLS Adj. R-squared: 0.325
Method: Least Squares F-statistic: 225.3
Date: Wed, 20 Dec 2023 Prob (F-statistic): 8.11e-42
Time: 11:28:06 Log-Likelihood: -921.04
No. Observations: 467 AIC: 1846.
Df Residuals: 465 BIC: 1854.
Df Model: 1
Covariance Type: nonrobust
coef std err t P>|t| [0.025 0.975]
const -0.0438 0.081 -0.542 0.588 -0.203 0.115
收益率 0.3328 0.022 15.011 0.000 0.289 0.376
Omnibus: 512.023 Durbin-Watson: 1.958
Prob(Omnibus): 0.000 Jarque-Bera (JB): 58498.105
Skew: 4.665 Prob(JB): 0.00
Kurtosis: 57.030 Cond. No. 3.66


Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.

因子暴露(Factor Exposure)是指股票对每个因子的敏感程度,即每个因子对于股票收益的贡献。可以通过回归分析来计算因子暴露。

import akshare as ak
import pandas as pd
import statsmodels.api as sm

# Stock codes to analyse
stock_symbols = ['600570', '600313']

# Fetch the forward-adjusted daily history of each stock and tag its rows with the code.
stock_frames = []
for symbol in stock_symbols:
    stock_data = ak.stock_zh_a_hist(symbol=symbol, period="daily", start_date="20230101", end_date='20231219', adjust="qfq")
    stock_data['证券代码'] = symbol
    stock_frames.append(stock_data)

# Concatenate once (instead of growing a frame inside the loop) and renumber the rows so
# the stacked frame has unique index labels.
combined_df = pd.concat(stock_frames, ignore_index=True)

# Daily return in percent, computed per stock so that no return spans the boundary
# between two different stocks in the stacked frame.
combined_df['收益率'] = combined_df.groupby('证券代码')['收盘'].pct_change() * 100
combined_df['涨跌幅'] = combined_df['涨跌幅'].astype(float)

# Drop rows with missing values (the first trading day of each stock has no return)
combined_df = combined_df.dropna()

# Regression model: 涨跌幅 is the dependent variable, 收益率 the independent variable.
X = sm.add_constant(combined_df['收益率'])  # add the constant (intercept) term
y = combined_df['涨跌幅']

# Fit by ordinary least squares
model = sm.OLS(y, X).fit()

# The factor exposure is the fitted slope on the return factor
factor_exposure = model.params['收益率']

# Report the factor exposure
print(f"因子暴露: {factor_exposure}")
因子暴露: 0.33282243711466647
import akshare as ak
import pandas as pd
import statsmodels.api as sm

# Stock codes to analyse
stock_symbols = ['600570', '600313']

# Fetch the forward-adjusted daily history of each stock and tag its rows with the code.
stock_frames = []
for symbol in stock_symbols:
    stock_data = ak.stock_zh_a_hist(symbol=symbol, period="daily", start_date="20230101", end_date='20231219', adjust="qfq")
    stock_data['证券代码'] = symbol
    stock_frames.append(stock_data)

# Concatenate once (instead of growing a frame inside the loop) and renumber the rows so
# the stacked frame has unique index labels.
combined_df = pd.concat(stock_frames, ignore_index=True)

# Daily return in percent, computed per stock so that no return spans the boundary
# between two different stocks in the stacked frame.
combined_df['收益率'] = combined_df.groupby('证券代码')['收盘'].pct_change() * 100
combined_df['涨跌幅'] = combined_df['涨跌幅'].astype(float)
combined_df['成交量'] = combined_df['成交量'].astype(float)

# Drop rows with missing values (the first trading day of each stock has no return)
combined_df = combined_df.dropna()

# Multiple regression: 涨跌幅 is the dependent variable; 收益率 and 成交量 are the
# independent variables.
# NOTE(review): 成交量 is several orders of magnitude larger than 收益率, which is what the
# large condition number in the summary points at -- consider rescaling it.
X = sm.add_constant(combined_df[['收益率', '成交量']])  # add the constant (intercept) term
y = combined_df['涨跌幅']

# Fit by ordinary least squares
model = sm.OLS(y, X).fit()

# The factor exposures are the fitted slopes of the two regressors
factor_exposure_returns = model.params['收益率']
factor_exposure_volume = model.params['成交量']

# Report the factor exposures
print(f"收益率因子暴露: {factor_exposure_returns}")
print(f"成交量因子暴露: {factor_exposure_volume}")
收益率因子暴露: 0.3172937471715912
成交量因子暴露: 2.146227651605541e-06
# 显示多元回归的结果
model.summary()
OLS Regression Results
Dep. Variable: 涨跌幅 R-squared: 0.361
Model: OLS Adj. R-squared: 0.358
Method: Least Squares F-statistic: 130.8
Date: Wed, 20 Dec 2023 Prob (F-statistic): 8.71e-46
Time: 13:09:59 Log-Likelihood: -908.88
No. Observations: 467 AIC: 1824.
Df Residuals: 464 BIC: 1836.
Df Model: 2
Covariance Type: nonrobust
coef std err t P>|t| [0.025 0.975]
const -0.5781 0.133 -4.341 0.000 -0.840 -0.316
收益率 0.3173 0.022 14.523 0.000 0.274 0.360
成交量 2.146e-06 4.31e-07 4.980 0.000 1.3e-06 2.99e-06
Omnibus: 454.093 Durbin-Watson: 1.926
Prob(Omnibus): 0.000 Jarque-Bera (JB): 47379.024
Skew: 3.792 Prob(JB): 0.00
Kurtosis: 51.758 Cond. No. 5.22e+05


Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 5.22e+05. This might indicate that there are
strong multicollinearity or other numerical problems.
import akshare as ak
import pandas as pd

# Stock codes to query
stock_symbols = ['600004']

# Year whose reports should be kept (e.g. 2020)
desired_year = 2020

# Collect the filtered indicator rows of every stock, then concatenate once.
yearly_frames = []
for symbol in stock_symbols:
    # Financial-analysis indicator table for this stock
    stock_financial_analysis_indicator_data = ak.stock_financial_analysis_indicator(symbol=symbol)

    # Compare the *year* of each report date with desired_year. Comparing the date column
    # itself with the integer 2020 never matches, which is why the filter returned 0 rows.
    # NOTE(review): assumes 日期 holds date-like values that pd.to_datetime can parse -- confirm.
    report_dates = pd.to_datetime(stock_financial_analysis_indicator_data['日期'])
    # .copy() so that adding a column below writes to an independent frame, not to a
    # slice of the fetched table.
    filtered_df = stock_financial_analysis_indicator_data[report_dates.dt.year == desired_year].copy()

    # Tag the rows with the stock code
    filtered_df['证券代码'] = symbol

    yearly_frames.append(filtered_df)

# pd.concat rejects an empty list, so fall back to an empty frame when no symbol was queried.
stock_financial_analysis_indicator_df = (
    pd.concat(yearly_frames, ignore_index=True) if yearly_frames else pd.DataFrame()
)

# Display the combined frame
stock_financial_analysis_indicator_df
日期 摊薄每股收益(元) 加权每股收益(元) 每股收益_调整后(元) 扣除非经常性损益后的每股收益(元) 每股净资产_调整前(元) 每股净资产_调整后(元) 每股经营性现金流(元) 每股资本公积金(元) 每股未分配利润(元) ... 3年以内应收帐款(元) 1年以内预付货款(元) 1-2年以内预付货款(元) 2-3年以内预付货款(元) 3年以内预付货款(元) 1年以内其它应收款(元) 1-2年以内其它应收款(元) 2-3年以内其它应收款(元) 3年以内其它应收款(元) 证券代码

0 rows × 87 columns

8.十分位和四分位分析

# 通过pandas.qcut和groupby进行分位数分析

8.1 接入行情

import akshare as ak
# Fetch the forward-adjusted daily history and bind it to the name displayed on the next
# line. A misspelled target name here would leave `stock_zh_a_hist_df` pointing at
# whatever an earlier cell stored in it.
stock_zh_a_hist_df = ak.stock_zh_a_hist(symbol="600570", period="daily", start_date="20230101", end_date='20231219', adjust="qfq")
stock_zh_a_hist_df
日期 开盘 收盘 最高 最低 成交量 成交额 振幅 涨跌幅 涨跌额 换手率
0 2023-01-03 40.22 41.77 42.04 39.98 188785 7.876238e+08 5.11 3.57 1.44 0.99
1 2023-01-04 41.94 42.63 42.92 41.24 257842 1.088131e+09 4.02 2.06 0.86 1.36
2 2023-01-05 42.19 41.96 42.61 41.57 99206 4.184562e+08 2.44 -1.57 -0.67 0.52
3 2023-01-06 42.30 42.08 43.87 41.57 218465 9.297619e+08 5.48 0.29 0.12 1.15
4 2023-01-09 41.45 42.40 42.98 40.77 191544 8.106186e+08 5.25 0.76 0.32 1.01
... ... ... ... ... ... ... ... ... ... ... ...
229 2023-12-13 29.78 29.34 30.16 29.33 235863 7.014522e+08 2.79 -1.28 -0.38 1.24
230 2023-12-14 29.60 29.17 29.87 29.15 155482 4.586276e+08 2.45 -0.58 -0.17 0.82
231 2023-12-15 29.32 28.77 29.36 28.71 185556 5.368904e+08 2.23 -1.37 -0.40 0.98
232 2023-12-18 28.55 28.49 29.18 28.30 160454 4.597516e+08 3.06 -0.97 -0.28 0.84
233 2023-12-19 28.48 28.66 28.81 28.27 150101 4.282943e+08 1.90 0.60 0.17 0.79

234 rows × 11 columns

import akshare as ak

# 获取特定股票的历史行情数据
stock_zh_a_hist_df = ak.stock_zh_a_hist(
    symbol="600570",
    period="daily",
    start_date="20230101",
    end_date="20231219",
    adjust="qfq"
)

# 显示历史行情数据
stock_zh_a_hist_df
日期 开盘 收盘 最高 最低 成交量 成交额 振幅 涨跌幅 涨跌额 换手率
0 2023-01-03 40.22 41.77 42.04 39.98 188785 7.876238e+08 5.11 3.57 1.44 0.99
1 2023-01-04 41.94 42.63 42.92 41.24 257842 1.088131e+09 4.02 2.06 0.86 1.36
2 2023-01-05 42.19 41.96 42.61 41.57 99206 4.184562e+08 2.44 -1.57 -0.67 0.52
3 2023-01-06 42.30 42.08 43.87 41.57 218465 9.297619e+08 5.48 0.29 0.12 1.15
4 2023-01-09 41.45 42.40 42.98 40.77 191544 8.106186e+08 5.25 0.76 0.32 1.01
... ... ... ... ... ... ... ... ... ... ... ...
229 2023-12-13 29.78 29.34 30.16 29.33 235863 7.014522e+08 2.79 -1.28 -0.38 1.24
230 2023-12-14 29.60 29.17 29.87 29.15 155482 4.586276e+08 2.45 -0.58 -0.17 0.82
231 2023-12-15 29.32 28.77 29.36 28.71 185556 5.368904e+08 2.23 -1.37 -0.40 0.98
232 2023-12-18 28.55 28.49 29.18 28.30 160454 4.597516e+08 3.06 -0.97 -0.28 0.84
233 2023-12-19 28.48 28.66 28.81 28.27 150101 4.282943e+08 1.90 0.60 0.17 0.79

234 rows × 11 columns

8.2 计算日收益率

# 计算日收益率
stock_zh_a_hist_df['日收益率'] = stock_zh_a_hist_df['收盘'].pct_change()

# 显示包含日收益率的 DataFrame
stock_zh_a_hist_df[['日期', '收盘', '日收益率']]
日期 收盘 日收益率
0 2023-01-03 41.77 NaN
1 2023-01-04 42.63 0.020589
2 2023-01-05 41.96 -0.015717
3 2023-01-06 42.08 0.002860
4 2023-01-09 42.40 0.007605
... ... ... ...
229 2023-12-13 29.34 -0.012786
230 2023-12-14 29.17 -0.005794
231 2023-12-15 28.77 -0.013713
232 2023-12-18 28.49 -0.009732
233 2023-12-19 28.66 0.005967

234 rows × 3 columns

8.3 将收益率变换为趋势信号

# 编写将收益率变换为趋势信号的函数
def generate_trend_signal(returns, lookback, lag):
    """Turn a return series into a lagged rolling-sum trend signal.

    Args:
        returns: pandas Series of per-period returns.
        lookback: Length of the rolling window, in rows.
        lag: Number of rows by which the rolling sum is shifted forward, so the
            value at row ``t`` is the rolling sum that ended at row ``t - lag``.

    Returns:
        A Series with the same index as ``returns``. Rows whose window holds fewer
        than ``max(lookback - 5, 0)`` observations, and the first ``lag`` rows, are NaN.
    """
    # A window may miss up to 5 observations. Clamp at 0 because pandas rejects a
    # negative min_periods, which made every lookback below 5 raise ValueError.
    min_obs = max(lookback - 5, 0)
    signal = returns.rolling(window=lookback, min_periods=min_obs).sum()
    return signal.shift(lag)

# 使用函数生成趋势信号
lookback_period = 100
lag_period = 3
stock_zh_a_hist_df['趋势信号'] = generate_trend_signal(stock_zh_a_hist_df['日收益率'], lookback_period, lag_period)

# 显示包含趋势信号的 DataFrame
stock_zh_a_hist_df[['日期', '收盘', '日收益率', '趋势信号']]
日期 收盘 日收益率 趋势信号
0 2023-01-03 41.77 NaN NaN
1 2023-01-04 42.63 0.020589 NaN
2 2023-01-05 41.96 -0.015717 NaN
3 2023-01-06 42.08 0.002860 NaN
4 2023-01-09 42.40 0.007605 NaN
... ... ... ... ...
229 2023-12-13 29.34 -0.012786 -0.200829
230 2023-12-14 29.17 -0.005794 -0.203740
231 2023-12-15 28.77 -0.013713 -0.236405
232 2023-12-18 28.49 -0.009732 -0.234316
233 2023-12-19 28.66 0.005967 -0.239846

234 rows × 4 columns

8.4 编写交易策略

根据每周五动量信号进行交易(注意:下面的示例代码实际上逐日使用趋势信号,尚未按每周五对信号进行重采样)

import pandas as pd
import matplotlib.pyplot as plt
plt.rcParams['font.sans-serif'] = ['SimHei']

# 根据每周五的趋势信号生成交易策略
def weekly_momentum_strategy(df, signal_column='趋势信号'):
    """Apply a long-or-flat momentum rule and compute its cumulative return.

    The frame is modified in place: the columns '信号', '日收益率', '策略收益率' and
    '累计收益率' are added (or overwritten), and the same object is returned.

    Despite the function name, the signal is evaluated on every row; no weekly or
    Friday resampling happens here.

    Args:
        df: DataFrame with a '收盘' (close price) column and the signal column.
        signal_column: Name of the column holding the trend signal.

    Returns:
        The input ``df`` with the four strategy columns filled in.
    """
    # Hold the stock (1) when the trend signal is positive, otherwise stay flat (0).
    # A NaN signal compares as not greater than 0 and therefore maps to 0.
    df['信号'] = np.where(df[signal_column] > 0, 1, 0)

    # Day-over-day fractional change of the close price
    df['日收益率'] = df['收盘'].pct_change()

    # Earn today's return only if the previous row's signal was long; the one-row shift
    # avoids trading on information from the same day. This must read the '信号' column
    # written above -- a 'Position' column is never created by this function.
    df['策略收益率'] = df['信号'].shift(1) * df['日收益率']

    # Compound the strategy returns into a cumulative return index
    df['累计收益率'] = (1 + df['策略收益率']).cumprod()

    return df

# 应用策略并生成收益指数
strategy_df = weekly_momentum_strategy(stock_zh_a_hist_df)

# 显示包含策略结果的 DataFrame
strategy_df[['日期', '收盘', '趋势信号', '信号', '日收益率', '策略收益率', '累计收益率']]
日期 收盘 趋势信号 信号 日收益率 策略收益率 累计收益率
0 2023-01-03 41.77 NaN 0 NaN NaN NaN
1 2023-01-04 42.63 NaN 0 0.020589 0.0 1.000000
2 2023-01-05 41.96 NaN 0 -0.015717 -0.0 1.000000
3 2023-01-06 42.08 NaN 0 0.002860 0.0 1.000000
4 2023-01-09 42.40 NaN 0 0.007605 0.0 1.000000
... ... ... ... ... ... ... ...
229 2023-12-13 29.34 -0.200829 0 -0.012786 -0.0 0.994311
230 2023-12-14 29.17 -0.203740 0 -0.005794 -0.0 0.994311
231 2023-12-15 28.77 -0.236405 0 -0.013713 -0.0 0.994311
232 2023-12-18 28.49 -0.234316 0 -0.009732 -0.0 0.994311
233 2023-12-19 28.66 -0.239846 0 0.005967 0.0 0.994311

234 rows × 7 columns

# 绘制收益指数图表
plt.figure(figsize=(10, 6))
plt.plot(strategy_df['日期'], strategy_df['累计收益率'], label='累计收益率')
plt.title('每周五动量信号交易策略累计收益率')
plt.xlabel('日期')
plt.ylabel('累计收益率')
plt.legend()
plt.show()

image