Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Purge bad Yahoo data from requests_cache #1204

Draft
wants to merge 8 commits into
base: dev
Choose a base branch
from
36 changes: 23 additions & 13 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -154,19 +154,6 @@ msft.option_chain(..., proxy="PROXY_SERVER")
...
```

To use a custom `requests` session (for example to cache calls to the
API or customize the `User-agent` header), pass a `session=` argument to
the Ticker constructor.

```python
import requests_cache
session = requests_cache.CachedSession('yfinance.cache')
session.headers['User-agent'] = 'my-program/1.0'
ticker = yf.Ticker('msft', session=session)
# The scraped response will be stored in the cache
ticker.actions
```

To initialize multiple `Ticker` objects, use

```python
Expand All @@ -180,6 +167,29 @@ tickers.tickers['AAPL'].history(period="1mo")
tickers.tickers['GOOG'].actions
```

### Caching

Heavy users will quickly encounter Yahoo's rate limits on free use.
A `requests` session can help by caching web requests.
To use, pass a `session=` argument to the Ticker constructor:

```python
import requests_cache
session = requests_cache.CachedSession('yfinance.cache')
# session.headers['User-agent'] = 'my-program/1.0' # <- Optional
ticker = yf.Ticker('msft', session=session)
# The scraped response will be stored in the cache
ticker.actions
```
To keep the cache healthy, `yfinance` automatically removes any cached requests whose responses failed to parse.
To disable this feature call `yfinance.disable_prune_session_cache()`.

Add expiration to the session to prune old data:
```python
import datetime
session = requests_cache.CachedSession('yfinance.cache', expire_after=datetime.timedelta(minutes=60))
```
More info here: https://requests-cache.readthedocs.io/en/stable/user_guide/expiration.html

### Fetching data for multiple tickers

```python
Expand Down
65 changes: 34 additions & 31 deletions tests/prices.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,24 +26,25 @@ def test_daily_index(self):
tkrs = ["BHP.AX", "IMP.JO", "BP.L", "PNL.L", "INTC"]
intervals = ["1d", "1wk", "1mo"]
for tkr in tkrs:
dat = yf.Ticker(tkr, session=self.session)
ticker = yf.Ticker(tkr, session=self.session)

for interval in intervals:
df = dat.history(period="5y", interval=interval)
df = ticker.history(period="5y", interval=interval)

f = df.index.time == _dt.time(0)
self.assertTrue(f.all())

def test_duplicatingHourly(self):
tkrs = ["IMP.JO", "BHG.JO", "SSW.JO", "BP.L", "INTC"]
for tkr in tkrs:
dat = yf.Ticker(tkr, session=self.session)
tz = dat._get_ticker_tz(debug_mode=False, proxy=None, timeout=None)
ticker = yf.Ticker(tkr, session=self.session)
tz = ticker._get_ticker_tz(debug_mode=False, proxy=None, timeout=None)

dt_utc = _tz.timezone("UTC").localize(_dt.datetime.utcnow())
dt = dt_utc.astimezone(_tz.timezone(tz))

start_d = dt.date() - _dt.timedelta(days=7)
df = dat.history(start=start_d, interval="1h")
df = ticker.history(start=start_d, interval="1h")

dt0 = df.index[-2]
dt1 = df.index[-1]
Expand All @@ -57,16 +58,16 @@ def test_duplicatingDaily(self):
tkrs = ["IMP.JO", "BHG.JO", "SSW.JO", "BP.L", "INTC"]
test_run = False
for tkr in tkrs:
dat = yf.Ticker(tkr, session=self.session)
tz = dat._get_ticker_tz(debug_mode=False, proxy=None, timeout=None)
ticker = yf.Ticker(tkr, session=self.session)
tz = ticker._get_ticker_tz(debug_mode=False, proxy=None, timeout=None)

dt_utc = _tz.timezone("UTC").localize(_dt.datetime.utcnow())
dt = dt_utc.astimezone(_tz.timezone(tz))
if dt.time() < _dt.time(17, 0):
continue
test_run = True

df = dat.history(start=dt.date() - _dt.timedelta(days=7), interval="1d")
df = ticker.history(start=dt.date() - _dt.timedelta(days=7), interval="1d")

dt0 = df.index[-2]
dt1 = df.index[-1]
Expand All @@ -83,15 +84,15 @@ def test_duplicatingWeekly(self):
tkrs = ['MSFT', 'IWO', 'VFINX', '^GSPC', 'BTC-USD']
test_run = False
for tkr in tkrs:
dat = yf.Ticker(tkr, session=self.session)
tz = dat._get_ticker_tz(debug_mode=False, proxy=None, timeout=None)
ticker = yf.Ticker(tkr, session=self.session)
tz = ticker._get_ticker_tz(debug_mode=False, proxy=None, timeout=None)

dt = _tz.timezone(tz).localize(_dt.datetime.now())
if dt.date().weekday() not in [1, 2, 3, 4]:
continue
test_run = True

df = dat.history(start=dt.date() - _dt.timedelta(days=7), interval="1wk")
df = ticker.history(start=dt.date() - _dt.timedelta(days=7), interval="1wk")
dt0 = df.index[-2]
dt1 = df.index[-1]
try:
Expand Down Expand Up @@ -246,16 +247,16 @@ def test_dst_fix(self):
# The correction is successful if no days are weekend, and weekly data begins Monday

tkr = "AGRO3.SA"
dat = yf.Ticker(tkr, session=self.session)
ticker = yf.Ticker(tkr, session=self.session)
start = "2021-01-11"
end = "2022-11-05"

interval = "1d"
df = dat.history(start=start, end=end, interval=interval)
df = ticker.history(start=start, end=end, interval=interval)
self.assertTrue(((df.index.weekday >= 0) & (df.index.weekday <= 4)).all())

interval = "1wk"
df = dat.history(start=start, end=end, interval=interval)
df = ticker.history(start=start, end=end, interval=interval)
try:
self.assertTrue((df.index.weekday == 0).all())
except:
Expand Down Expand Up @@ -377,8 +378,8 @@ def test_weekly_2rows_fix(self):
start = _dt.date.today() - _dt.timedelta(days=14)
start -= _dt.timedelta(days=start.weekday())

dat = yf.Ticker(tkr)
df = dat.history(start=start, interval="1wk")
ticker = yf.Ticker(tkr)
df = ticker.history(start=start, interval="1wk")
self.assertTrue((df.index.weekday == 0).all())

class TestPriceRepair(unittest.TestCase):
Expand Down Expand Up @@ -417,7 +418,7 @@ def test_repair_100x_weekly(self):
# Setup:
tkr = "PNL.L"
dat = yf.Ticker(tkr, session=self.session)
tz_exchange = dat.fast_info["timezone"]
tz_exchange = ticker.fast_info["timezone"]

data_cols = ["Low", "High", "Open", "Close", "Adj Close"]
df = _pd.DataFrame(data={"Open": [470.5, 473.5, 474.5, 470],
Expand All @@ -441,7 +442,7 @@ def test_repair_100x_weekly(self):

# Run test

df_repaired = dat._fix_unit_mixups(df_bad, "1wk", tz_exchange, prepost=False)
df_repaired = ticker._fix_unit_mixups(df_bad, "1wk", tz_exchange, prepost=False)

# First test - no errors left
for c in data_cols:
Expand All @@ -468,8 +469,9 @@ def test_repair_100x_weekly_preSplit(self):
# PNL.L has a stock-split in 2022. Sometimes requesting data before 2022 is not split-adjusted.

tkr = "PNL.L"
dat = yf.Ticker(tkr, session=self.session)
tz_exchange = dat.fast_info["timezone"]

ticker = yf.Ticker(tkr, session=self.session)
tz_exchange = ticker.fast_info["timezone"]

data_cols = ["Low", "High", "Open", "Close", "Adj Close"]
df = _pd.DataFrame(data={"Open": [400, 398, 392.5, 417],
Expand All @@ -496,7 +498,7 @@ def test_repair_100x_weekly_preSplit(self):
df.index = df.index.tz_localize(tz_exchange)
df_bad.index = df_bad.index.tz_localize(tz_exchange)

df_repaired = dat._fix_unit_mixups(df_bad, "1wk", tz_exchange, prepost=False)
df_repaired = ticker._fix_unit_mixups(df_bad, "1wk", tz_exchange, prepost=False)

# First test - no errors left
for c in data_cols:
Expand All @@ -523,8 +525,8 @@ def test_repair_100x_weekly_preSplit(self):

def test_repair_100x_daily(self):
tkr = "PNL.L"
dat = yf.Ticker(tkr, session=self.session)
tz_exchange = dat.fast_info["timezone"]
ticker = yf.Ticker(tkr, session=self.session)
tz_exchange = ticker.fast_info["timezone"]

data_cols = ["Low", "High", "Open", "Close", "Adj Close"]
df = _pd.DataFrame(data={"Open": [478, 476, 476, 472],
Expand All @@ -546,7 +548,7 @@ def test_repair_100x_daily(self):
df.index = df.index.tz_localize(tz_exchange)
df_bad.index = df_bad.index.tz_localize(tz_exchange)

df_repaired = dat._fix_unit_mixups(df_bad, "1d", tz_exchange, prepost=False)
df_repaired = ticker._fix_unit_mixups(df_bad, "1d", tz_exchange, prepost=False)

# First test - no errors left
for c in data_cols:
Expand All @@ -565,8 +567,9 @@ def test_repair_100x_daily(self):

def test_repair_zeroes_daily(self):
tkr = "BBIL.L"
dat = yf.Ticker(tkr, session=self.session)
tz_exchange = dat.fast_info["timezone"]

ticker = yf.Ticker(tkr, session=self.session)
tz_exchange = ticker.fast_info["timezone"]

df_bad = _pd.DataFrame(data={"Open": [0, 102.04, 102.04],
"High": [0, 102.1, 102.11],
Expand All @@ -581,7 +584,7 @@ def test_repair_zeroes_daily(self):
df_bad.index.name = "Date"
df_bad.index = df_bad.index.tz_localize(tz_exchange)

repaired_df = dat._fix_zeroes(df_bad, "1d", tz_exchange, prepost=False)
repaired_df = ticker._fix_zeroes(df_bad, "1d", tz_exchange, prepost=False)

correct_df = df_bad.copy()
correct_df.loc["2022-11-01", "Open"] = 102.080002
Expand All @@ -592,10 +595,10 @@ def test_repair_zeroes_daily(self):

def test_repair_zeroes_hourly(self):
tkr = "INTC"
dat = yf.Ticker(tkr, session=self.session)
tz_exchange = dat.fast_info["timezone"]
ticker = yf.Ticker(tkr, session=self.session)
tz_exchange = ticker.fast_info["timezone"]

correct_df = dat.history(period="1wk", interval="1h", auto_adjust=False, repair=True)
correct_df = ticker.history(period="1wk", interval="1h", auto_adjust=False, repair=True)

df_bad = correct_df.copy()
bad_idx = correct_df.index[10]
Expand All @@ -606,7 +609,7 @@ def test_repair_zeroes_hourly(self):
df_bad.loc[bad_idx, "Adj Close"] = _np.nan
df_bad.loc[bad_idx, "Volume"] = 0

repaired_df = dat._fix_zeroes(df_bad, "1h", tz_exchange, prepost=False)
repaired_df = ticker._fix_zeroes(df_bad, "1h", tz_exchange, prepost=False)

for c in ["Open", "Low", "High", "Close"]:
try:
Expand Down
Loading