Factor Returns Computation Validation#
In this tutorial we are going to compute factor returns from a standard Bayesline factor model and compare the results to those we manually compute with statsmodels. The specific steps we will follow are:
Create a basic risk model and extract factor returns.
Manually compute factor returns with
statsmodels.Compare results of extracted factor returns and
statsmodelsregression.
Throughout this notebook we work with a randomly generated dataset. The results should generalize to real data, but we do not show any real data on our public API. Bayesline clients can run this notebook on real data.
0. Imports & Setup#
For this tutorial notebook, you will need to import the following packages.
import numpy as np
import numpy.testing as tst
import pandas as pd
from statsmodels.api import WLS
from statsmodels.stats.weightstats import DescrStatsW
from bayesline.api.equity import (
CategoricalExposureGroupSettings,
CategoricalFilterSettings,
ContinuousExposureGroupSettings,
ExposureSettings,
FactorRiskModelSettings,
ModelConstructionSettings,
UniverseSettings,
)
from bayesline.apiclient import BayeslineApiClient
We will also need to have a Bayesline API client configured.
bln = BayeslineApiClient.new_client(
endpoint="https://[ENDPOINT]",
api_key="[API-KEY]",
)
1. Create a basic risk model and extract factor returns#
Set up the risk model settings#
We will set up a risk model to use InvIdioVar weights and otherwise default settings.
factorriskmodel_settings = FactorRiskModelSettings(
universe=UniverseSettings(dataset="Bayesline-US-500-1y"),
exposures=ExposureSettings(
exposures=[
ContinuousExposureGroupSettings(hierarchy="market"),
CategoricalExposureGroupSettings(hierarchy="trbc"),
ContinuousExposureGroupSettings(
hierarchy="style", standardize_method="equal_weighted"
),
],
),
modelconstruction=ModelConstructionSettings(
weights="InvIdioVar",
estimation_universe=UniverseSettings(
dataset="Bayesline-US-500-1y",
categorical_filters=[
CategoricalFilterSettings(hierarchy="estimation_universe")
],
),
return_clip_bounds=(None, None),
zero_sum_constraints={"trbc": "mcap_weighted"},
),
)
Let’s verify the risk model settings we configured above.
print(factorriskmodel_settings.model_dump_json(indent=2))
{
"universe": [
{
"dataset": "Bayesline-US-500-1y",
"id_type": "bayesid",
"calendar": {
"dataset": "Bayesline-US-500-1y",
"filters": [
[
"XNYS"
]
]
},
"categorical_filters": [],
"portfolio_filter": null,
"mcap_filter": {
"lower": 0.0,
"upper": 1e20
}
}
],
"exposures": [
{
"exposures": [
{
"exposure_type": "continuous",
"hierarchy": {
"hierarchy_type": "level",
"name": "market",
"level": 1
},
"factor_group": "market",
"include": "All",
"exclude": [],
"standardize_method": "none"
},
{
"exposure_type": "categorical",
"hierarchy": {
"hierarchy_type": "level",
"name": "trbc",
"level": 1
},
"factor_group": "trbc",
"include": "All",
"exclude": []
},
{
"exposure_type": "continuous",
"hierarchy": {
"hierarchy_type": "level",
"name": "style",
"level": 1
},
"factor_group": "style",
"include": "All",
"exclude": [],
"standardize_method": "equal_weighted"
}
]
}
],
"modelconstruction": [
{
"currency": "USD",
"weights": "InvIdioVar",
"estimation_universe": {
"dataset": "Bayesline-US-500-1y",
"id_type": "bayesid",
"calendar": {
"dataset": "Bayesline-US-500-1y",
"filters": [
[
"XNYS"
]
]
},
"categorical_filters": [
{
"hierarchy": "estimation_universe",
"include": "All",
"exclude": []
}
],
"portfolio_filter": null,
"mcap_filter": {
"lower": 0.0,
"upper": 1e20
}
},
"return_clip_bounds": [
null,
null
],
"thin_category_shrinkage": {},
"thin_category_shrinkage_overrides": {},
"zero_sum_constraints": {
"trbc": "mcap_weighted"
},
"known_factor_map": {},
"fx_convert_returns": true
}
],
"halflife_idio_vra": null
}
Construct risk model with settings#
risk_model = bln.equity.riskmodels.load(factorriskmodel_settings).get_model()
dataset = bln.equity.riskdatasets.load("Bayesline-US-500-1y")
dataset.describe().exposure_settings_menu.continuous_hierarchies
{'market': ['market'],
'style': {'size': ['log_market_cap', 'log_total_assets'],
'value': ['book_to_price'],
'growth': ['price_to_earnings'],
'volatility': ['sigma', 'sigma_eps', 'beta'],
'momentum': ['mom6', 'mom12'],
'dividend': ['dividend_yield'],
'leverage': ['debt_to_assets', 'debt_to_equity']},
'other': ['risk_free_rate', 'international_return']}
Extract factor returns from risk model#
factor_returns = (
(risk_model.fret())
.to_pandas()
.set_index("date")
)
factor_returns.columns = pd.MultiIndex.from_tuples(
factor_returns.columns.str.split(".").to_list(), names=["factor_group", "factor"]
)
Let’s take a peek at the factor returns that our risk model computed. We have several groups of factors (market, industry, region, and style), and one or two factors in each group.
factor_returns.head()
| factor_group | market | style | trbc | ||||||||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| factor | Market | Dividend | Growth | Leverage | Momentum | Size | Value | Volatility | Academic & Educational Services | Basic Materials | ... | Consumer Non-Cyclicals | Energy | Financials | Government Activity | Healthcare | Industrials | Institutions, Associations & Organizations | Real Estate | Technology | Utilities |
| date | |||||||||||||||||||||
| 2025-03-31 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.0 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.0 | 0.000000 | 0.000000 | 0.0 | 0.000000 | 0.000000 | 0.000000 |
| 2025-04-01 | 0.002389 | -0.001670 | -0.000164 | -0.000203 | 0.001727 | -0.000717 | -0.002186 | 0.003468 | 0.0 | 0.005279 | ... | 0.005538 | 0.009317 | 0.000243 | 0.0 | -0.013778 | 0.002317 | 0.0 | 0.001743 | 0.000355 | 0.005273 |
| 2025-04-02 | 0.010604 | -0.000226 | 0.001085 | 0.002098 | -0.000473 | -0.002319 | 0.000226 | 0.006432 | 0.0 | 0.000034 | ... | 0.002944 | -0.003857 | 0.004190 | 0.0 | 0.001042 | 0.000536 | 0.0 | -0.002076 | -0.002568 | -0.002712 |
| 2025-04-03 | -0.056701 | -0.004549 | 0.000529 | -0.003550 | 0.006523 | 0.003314 | -0.003437 | -0.024596 | 0.0 | 0.006522 | ... | -0.004207 | -0.013785 | -0.008240 | 0.0 | 0.025792 | -0.006996 | 0.0 | 0.012528 | -0.003335 | 0.037673 |
| 2025-04-04 | -0.059780 | 0.000776 | -0.003235 | -0.000035 | -0.006187 | -0.003400 | -0.000313 | -0.013056 | 0.0 | -0.002142 | ... | 0.010029 | -0.031940 | -0.009083 | 0.0 | -0.002396 | -0.002323 | 0.0 | 0.004590 | -0.003216 | 0.000804 |
5 rows × 21 columns
2. Manually compute factor returns with statsmodels#
Now, we will manually compute factor returns for numerical comparison to our results.
Write manual regression code#
This is our implementation of a basic factor returns regression with statsmodels which we will use for comparison. This implementation uses weighted constrained linear regression to model factor returns. Specifically, it solves the following optimization problem to compute factor returns \(f_t\)
where \(j\in\mathrm{ind}\) are all industry factors and \(W_{i,t}\) are the weights of each stock. The constraint effectively says that the (market-cap weighted) industry factor returns sum to zero.
We use weighted linear regression, and from an econometric perspective, the optimal regression weights are (proportional to) the inverses of the variance, \(W_{i,t}=1/\mathrm{Var}(\varepsilon_{i,t})\). Since this is not known, we use the idiosyncratic volatility, or the estimated error variance of a 100-day rolling time-series regression of the returns of each stock against its market factor, \(W_{i,t}=100/\sum_{\tau=1}^{100}{e_{i,t-\tau}^2}\), for the fitted regression,
where the market return is simply the market-cap weighted return of all assets in the estimation universe, \(r_{t}^M=\sum_{i\in\mathcal{I}_t^E}{\mathrm{MCap}_{i,t}r_{i,t}}\).
def statsmodels_regression(
df: pd.DataFrame,
market_caps: pd.DataFrame,
industry_names: list[str],
substyle_names: list[str],
drop_ind: str | None = None,
) -> pd.DataFrame:
df = df.dropna()
df = df[df["estimation_universe"]]
factor_names = ["Market", *industry_names, *substyle_names]
weights = (1 / df["idio_vol"] ** 2).fillna(0.0)
X = df[factor_names].copy().astype(np.float32)
y = df["return"]
# compute the adjustment for industry exposure summing to zero
# drop a fixed industry with non-zero mcap
if drop_ind is None:
drop_ind = industry_names[market_caps.values.argmax()]
drop_idx = industry_names.index(drop_ind)
keep_inds = [n for n in industry_names if n != drop_ind]
adj = market_caps.values[np.arange(len(industry_names)) != drop_idx] / market_caps.values[drop_idx]
X.loc[:, keep_inds] -= adj * X.loc[:, [drop_ind]].values
X = X.drop(columns=drop_ind)
wls = WLS(
endog=y,
exog=X,
weights=weights,
missing="drop",
hasconst=False, # for r-squared calculation
).fit()
sigma2_eps = DescrStatsW(wls.resid, weights=weights, ddof=0).var
# if the mcap is zero, then t_stats are 0.0 and p_values are 1.0
zero_mcap = [i.replace("trbc.", "") for i in market_caps.loc[market_caps == 0.0].index]
wls.tvalues[zero_mcap] = 0.0
wls.pvalues[zero_mcap] = 1.0
wls_results = pd.concat(
{
"factor_returns": wls.params,
"t_stats": wls.tvalues,
"p_values": wls.pvalues,
"r2": pd.Series([wls.rsquared], [None]),
"sigma2": pd.Series(sigma2_eps, [None]),
},
)
return wls_results
Extract the required regression input data to compute factor returns#
A. Exposures#
First, let’s get the exposures for this universe as a pandas DataFrame.
X = risk_model.exposures().to_pandas()
X.head()
| date | bayesid | market.Market | style.Dividend | style.Growth | style.Leverage | style.Momentum | style.Size | style.Value | style.Volatility | ... | trbc.Consumer Non-Cyclicals | trbc.Energy | trbc.Financials | trbc.Government Activity | trbc.Healthcare | trbc.Industrials | trbc.Institutions, Associations & Organizations | trbc.Real Estate | trbc.Technology | trbc.Utilities | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2025-03-31 | IC006CA2E0 | 1.0 | 0.173330 | 0.161842 | -0.272641 | 0.720320 | 1.527282 | 0.233148 | -0.305778 | ... | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 1 | 2025-03-31 | IC01040E7C | 1.0 | 0.456846 | -2.246155 | -0.630698 | 1.176452 | -0.480233 | 1.628404 | -0.312413 | ... | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 2 | 2025-03-31 | IC0121F541 | 1.0 | -0.797155 | 0.586450 | 2.492548 | 0.396647 | -1.147491 | -1.225354 | -1.179120 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 |
| 3 | 2025-03-31 | IC012D373B | 1.0 | -0.539123 | -0.090887 | 0.862522 | -0.881848 | -0.072703 | -0.299995 | 0.765667 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 4 | 2025-03-31 | IC01430182 | 1.0 | -1.214624 | 0.295274 | -1.590514 | -0.584077 | -0.682563 | -0.831440 | 0.890888 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
5 rows × 23 columns
Next, we’ll clean up our exposures dataframe slightly and massage it into the desired shape and structure.
# rename to be cleaner names corresponding to df
X.columns = [col.split('.')[-1] if '.' in col else col for col in X.columns]
X['date'] = pd.to_datetime(X["date"])
exposures = X.set_index(["date", "bayesid"])
exposures.head()
| Market | Dividend | Growth | Leverage | Momentum | Size | Value | Volatility | Academic & Educational Services | Basic Materials | ... | Consumer Non-Cyclicals | Energy | Financials | Government Activity | Healthcare | Industrials | Institutions, Associations & Organizations | Real Estate | Technology | Utilities | ||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| date | bayesid | |||||||||||||||||||||
| 2025-03-31 | IC006CA2E0 | 1.0 | 0.173330 | 0.161842 | -0.272641 | 0.720320 | 1.527282 | 0.233148 | -0.305778 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| IC01040E7C | 1.0 | 0.456846 | -2.246155 | -0.630698 | 1.176452 | -0.480233 | 1.628404 | -0.312413 | 0.0 | 0.0 | ... | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | |
| IC0121F541 | 1.0 | -0.797155 | 0.586450 | 2.492548 | 0.396647 | -1.147491 | -1.225354 | -1.179120 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | |
| IC012D373B | 1.0 | -0.539123 | -0.090887 | 0.862522 | -0.881848 | -0.072703 | -0.299995 | 0.765667 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | |
| IC01430182 | 1.0 | -1.214624 | 0.295274 | -1.590514 | -0.584077 | -0.682563 | -0.831440 | 0.890888 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
5 rows × 21 columns
B. Asset Returns#
Next, we get the returns of our asset universe across dates. Note that some values are NaN since our estimation universe does not include all the securities in the larger universe across all dates.
returns = (
risk_model.future_asset_returns()
.to_pandas()
.set_index("date")
.stack(dropna=False)
.rename("return")
.to_frame()
)
returns.head()
| return | ||
|---|---|---|
| date | ||
| 2025-03-31 | IC006CA2E0 | -0.002536 |
| IC01040E7C | 0.012756 | |
| IC0121F541 | 0.003326 | |
| IC012D373B | 0.008441 | |
| IC01430182 | -0.012118 |
C. Idiosyncractic Volatility#
idio_vol = (
risk_model.weights()
.to_pandas()
.set_index("date")
.stack()
.rename("idio_vol")
.to_frame()
)
idio_vol.head()
| idio_vol | ||
|---|---|---|
| date | ||
| 2025-03-31 | IC006CA2E0 | 0.011579 |
| IC01040E7C | 0.015426 | |
| IC0121F541 | 0.011306 | |
| IC012D373B | 0.017491 | |
| IC01430182 | 0.027865 |
D. Estimation Universe#
estimation_universe = (
risk_model.estimation_universe()
.to_pandas()
.set_index("date")
.stack()
.rename("estimation_universe")
.astype('bool')
.to_frame()
)
estimation_universe.head()
| estimation_universe | ||
|---|---|---|
| date | ||
| 2025-03-31 | IC006CA2E0 | True |
| IC01040E7C | True | |
| IC0121F541 | True | |
| IC012D373B | True | |
| IC01430182 | True |
Join all the regression components#
df_all = pd.concat(
[exposures, returns, idio_vol, estimation_universe],
axis=1
)
key = pd.MultiIndex.from_product(
[sorted(X['date'].unique()), sorted(X['bayesid'].unique())],
)
df_all = df_all.reindex(key)
df_all[["estimation_universe"]] = df_all[["estimation_universe"]].fillna(False)
df_all.head()
| Market | Dividend | Growth | Leverage | Momentum | Size | Value | Volatility | Academic & Educational Services | Basic Materials | ... | Government Activity | Healthcare | Industrials | Institutions, Associations & Organizations | Real Estate | Technology | Utilities | return | idio_vol | estimation_universe | ||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2025-03-31 | IC006CA2E0 | 1.0 | 0.173330 | 0.161842 | -0.272641 | 0.720320 | 1.527282 | 0.233148 | -0.305778 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | -0.002536 | 0.011579 | True |
| IC01040E7C | 1.0 | 0.456846 | -2.246155 | -0.630698 | 1.176452 | -0.480233 | 1.628404 | -0.312413 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.012756 | 0.015426 | True | |
| IC0121F541 | 1.0 | -0.797155 | 0.586450 | 2.492548 | 0.396647 | -1.147491 | -1.225354 | -1.179120 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.003326 | 0.011306 | True | |
| IC012D373B | 1.0 | -0.539123 | -0.090887 | 0.862522 | -0.881848 | -0.072703 | -0.299995 | 0.765667 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.008441 | 0.017491 | True | |
| IC01430182 | 1.0 | -1.214624 | 0.295274 | -1.590514 | -0.584077 | -0.682563 | -0.831440 | 0.890888 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | -0.012118 | 0.027865 | True |
5 rows × 24 columns
Extract Aggregated Market Cap#
We also need market cap data, but this comes in the form of already industry-aggregated market caps, so cannot be merged with the rest of the input data as it does not span across assets. We only need the market caps corresponding to our industry exposures, so we can filter out any other columns.
# get the market caps
market_caps = (
risk_model.market_caps()
.to_pandas()
.set_index("date")
.reindex(df_all.index.get_level_values(0).unique())
)
# get all industry names and reformat strings from factor returns
industries = factor_returns.loc[:,'trbc'].columns.to_list()
industries_long = [f"trbc.{i}" for i in industries]
# filter market caps to industries
market_caps = market_caps[industries_long]
# pick the industry with the largest mcap to drop (must be non-zero for constraint)
drop_ind = industries[market_caps.dropna().iloc[0].values.argmax()]
Drop weekly and forward fill#
Since factor returns tend to be stable over short periods of time, we only compute the regression every week on Wednesdays. Thus, we will drop all data on non-Wednesdays and forward fill.
# slice on reset day (every wednesday) and fill forward
forward_fill_cols = [*exposures.columns, "estimation_universe", "idio_vol"]
# fill returns
df_all.loc[df_all["estimation_universe"], "return"] = df_all.loc[
df_all["estimation_universe"],
"return",
].fillna(0.0)
df_all.head()
| Market | Dividend | Growth | Leverage | Momentum | Size | Value | Volatility | Academic & Educational Services | Basic Materials | ... | Government Activity | Healthcare | Industrials | Institutions, Associations & Organizations | Real Estate | Technology | Utilities | return | idio_vol | estimation_universe | ||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2025-03-31 | IC006CA2E0 | 1.0 | 0.173330 | 0.161842 | -0.272641 | 0.720320 | 1.527282 | 0.233148 | -0.305778 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | -0.002536 | 0.011579 | True |
| IC01040E7C | 1.0 | 0.456846 | -2.246155 | -0.630698 | 1.176452 | -0.480233 | 1.628404 | -0.312413 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.012756 | 0.015426 | True | |
| IC0121F541 | 1.0 | -0.797155 | 0.586450 | 2.492548 | 0.396647 | -1.147491 | -1.225354 | -1.179120 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.003326 | 0.011306 | True | |
| IC012D373B | 1.0 | -0.539123 | -0.090887 | 0.862522 | -0.881848 | -0.072703 | -0.299995 | 0.765667 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.008441 | 0.017491 | True | |
| IC01430182 | 1.0 | -1.214624 | 0.295274 | -1.590514 | -0.584077 | -0.682563 | -0.831440 | 0.890888 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | -0.012118 | 0.027865 | True |
5 rows × 24 columns
market_caps = market_caps.ffill()
market_caps.index.name = "date"
market_caps.head()
| trbc.Academic & Educational Services | trbc.Basic Materials | trbc.Consumer Cyclicals | trbc.Consumer Non-Cyclicals | trbc.Energy | trbc.Financials | trbc.Government Activity | trbc.Healthcare | trbc.Industrials | trbc.Institutions, Associations & Organizations | trbc.Real Estate | trbc.Technology | trbc.Utilities | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| date | |||||||||||||
| 2025-03-31 | 0.0 | 9.099311e+11 | 6.052074e+12 | 4.254438e+12 | 2.075624e+12 | 5.317734e+12 | 0.0 | 5.467851e+12 | 4.553074e+12 | 0.0 | 1.009778e+12 | 2.152182e+13 | 1.246728e+12 |
| 2025-04-01 | 0.0 | 9.125518e+11 | 6.115045e+12 | 4.266101e+12 | 2.088907e+12 | 5.315932e+12 | 0.0 | 5.369934e+12 | 4.569521e+12 | 0.0 | 1.011223e+12 | 2.173725e+13 | 1.253175e+12 |
| 2025-04-02 | 0.0 | 9.193467e+11 | 6.238026e+12 | 4.270846e+12 | 2.091793e+12 | 5.380981e+12 | 0.0 | 5.405962e+12 | 4.603137e+12 | 0.0 | 1.015697e+12 | 2.184620e+13 | 1.262476e+12 |
| 2025-04-03 | 0.0 | 8.799727e+11 | 5.846813e+12 | 4.254830e+12 | 1.944970e+12 | 5.035908e+12 | 0.0 | 5.362670e+12 | 4.373734e+12 | 0.0 | 9.873048e+11 | 2.048757e+13 | 1.246536e+12 |
| 2025-04-04 | 0.0 | 8.237540e+11 | 5.571600e+12 | 4.024545e+12 | 1.775897e+12 | 4.676706e+12 | 0.0 | 5.063200e+12 | 4.084828e+12 | 0.0 | 9.423221e+11 | 1.924078e+13 | 1.174405e+12 |
Run statsmodels regression to compute factor returns#
Now, we will take all the components we created above and run our manual regression function to compute factor returns.
# get trade days from the universe
trade_days = risk_model.universe().to_pandas().set_index("date").index
# get all style names factor returns
styles = factor_returns.loc[:,'style'].columns.to_list()
# perform regression on each date
manual_computation = {
g: statsmodels_regression(df, mcap, industries, styles, drop_ind=drop_ind)
for (g, df), (_, mcap) in zip(df_all.groupby(level=0), market_caps.iterrows(), strict=True)
if g in trade_days
}
manual_computation = (
pd.concat(manual_computation, axis=1, names=["date"])
.T.reindex(trade_days)
.shift(1)
.tail(-1)
)
manual_computation.head()
| factor_returns | ... | p_values | r2 | sigma2 | |||||||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Market | Academic & Educational Services | Basic Materials | Consumer Cyclicals | Consumer Non-Cyclicals | Energy | Financials | Government Activity | Healthcare | Industrials | ... | Utilities | Dividend | Growth | Leverage | Momentum | Size | Value | Volatility | NaN | NaN | |
| date | |||||||||||||||||||||
| 2025-04-01 | 0.002389 | -3.302413e-19 | 0.005279 | -0.000030 | 0.005538 | 0.009317 | 0.000243 | -3.854565e-19 | -0.013778 | 0.002317 | ... | 8.380799e-03 | 0.002746 | 0.795168 | 0.685097 | 9.817810e-04 | 0.124323 | 0.000125 | 8.969397e-10 | 0.359819 | 0.000077 |
| 2025-04-02 | 0.010604 | -3.759340e-18 | 0.000034 | 0.004330 | 0.002944 | -0.003857 | 0.004190 | 7.637193e-19 | 0.001042 | 0.000536 | ... | 1.779818e-01 | 0.688133 | 0.086282 | 0.000038 | 3.643193e-01 | 0.000001 | 0.692299 | 2.008621e-27 | 0.560441 | 0.000079 |
| 2025-04-03 | -0.056701 | 1.176817e-17 | 0.006522 | -0.001524 | -0.004207 | -0.013785 | -0.008240 | 6.531913e-18 | 0.025792 | -0.006996 | ... | 1.142414e-07 | 0.020713 | 0.809499 | 0.043077 | 3.781731e-04 | 0.045851 | 0.083610 | 3.180982e-32 | 0.718866 | 0.000945 |
| 2025-04-04 | -0.059780 | -1.887768e-17 | -0.002142 | 0.025728 | 0.010029 | -0.031940 | -0.009083 | 1.367395e-18 | -0.002396 | -0.002323 | ... | 8.573057e-01 | 0.533975 | 0.018148 | 0.975308 | 3.165371e-07 | 0.001423 | 0.802865 | 3.824011e-24 | 0.890849 | 0.000383 |
| 2025-04-07 | -0.002711 | 4.822107e-18 | -0.007072 | -0.014569 | 0.011582 | -0.005765 | 0.001255 | 1.247525e-17 | 0.002762 | -0.002190 | ... | 3.915492e-02 | 0.850834 | 0.291707 | 0.981236 | 9.212254e-04 | 0.415403 | 0.260029 | 8.989382e-17 | 0.392184 | 0.000249 |
5 rows × 62 columns
3. Compare results of extracted factor returns and statsmodels regression.#
In order to perform a complete comparison, we take our factor returns and check that all of the following match the statsmodels regression computation.
factor returns
t-stats
p-values
Compare factor returns#
# drop the constrained industry and zero-mcap industries from both sides
# API returns zero for industries with no assets in estimation universe
trbc_frets = factor_returns.loc[:, 'trbc'].iloc[1:] # skip burn-in
zero_mcap_inds = trbc_frets.columns[(trbc_frets == 0).all()].tolist()
drop_from_comparison = [drop_ind] + zero_mcap_inds
# API factor returns: skip burn-in first row, drop constrained + zero-mcap industries
frets_actual = factor_returns.iloc[1:].droplevel('factor_group', axis=1).drop(
columns=drop_from_comparison, errors='ignore'
)
# Manual factor returns: drop zero-mcap industries
frets_expect = manual_computation['factor_returns'].drop(
columns=zero_mcap_inds, errors='ignore'
)
# Compare on common dates (skip first degenerate date)
common_dates = frets_actual.index.intersection(frets_expect.index)[1:]
pd.testing.assert_frame_equal(
frets_actual.loc[common_dates],
frets_expect.loc[common_dates],
check_names=False,
check_dtype=False,
check_like=True,
atol=1e-7,
)
Compare t-stats#
t_stats = risk_model.t_stats().to_pandas().set_index('date').iloc[1:]
t_stats.columns = [col.split('.')[-1] if '.' in col else col for col in t_stats.columns]
t_stats = t_stats.drop(columns=drop_from_comparison, errors='ignore')
t_stats_manual = manual_computation['t_stats'].drop(
columns=zero_mcap_inds, errors='ignore'
)
pd.testing.assert_frame_equal(
t_stats.loc[common_dates],
t_stats_manual.loc[common_dates],
check_names=False,
check_dtype=False,
check_like=True,
atol=1e-4,
)
Compare p-values#
p_values = risk_model.p_values().to_pandas().set_index('date').iloc[1:]
p_values.columns = [col.split('.')[-1] if '.' in col else col for col in p_values.columns]
p_values = p_values.drop(columns=drop_from_comparison, errors='ignore')
p_values_manual = manual_computation['p_values'].drop(
columns=zero_mcap_inds, errors='ignore'
)
# compare p-values in different ranges with different precisions
tst.assert_array_almost_equal(
p_values.loc[common_dates].values.clip(None, 0.05),
p_values_manual.loc[common_dates].reindex(columns=p_values.columns).values.clip(None, 0.05),
decimal=4,
)
tst.assert_array_almost_equal(
p_values.loc[common_dates].values.clip(0.05, 0.9),
p_values_manual.loc[common_dates].reindex(columns=p_values.columns).values.clip(0.05, 0.9),
decimal=3,
)
tst.assert_array_almost_equal(
p_values.loc[common_dates].values.clip(0.9, None),
p_values_manual.loc[common_dates].reindex(columns=p_values.columns).values.clip(0.9, None),
decimal=2,
)