Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion litellm/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -358,7 +358,7 @@
)
# Blog posts now come from the docs-site RSS feed (previously a GitHub-hosted
# JSON file); operators can override the URL via LITELLM_BLOG_POSTS_URL.
blog_posts_url: str = os.getenv(
    "LITELLM_BLOG_POSTS_URL",
    "https://docs.litellm.ai/blog/rss.xml",
)
anthropic_beta_headers_url: str = os.getenv(
"LITELLM_ANTHROPIC_BETA_HEADERS_URL",
Expand Down
84 changes: 61 additions & 23 deletions litellm/litellm_core_utils/get_blog_posts.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
"""
Pulls the latest LiteLLM blog posts from GitHub.
Pulls the latest LiteLLM blog posts from the docs RSS feed.

Falls back to the bundled local backup on any failure.
GitHub JSON URL is configured via litellm.blog_posts_url (or LITELLM_BLOG_POSTS_URL env var).
RSS URL is configured via litellm.blog_posts_url (or LITELLM_BLOG_POSTS_URL env var).

Disable remote fetching entirely:
export LITELLM_LOCAL_BLOG_POSTS=True
Expand All @@ -11,8 +11,10 @@
import json
import os
import time
import xml.etree.ElementTree as ET
from email.utils import parsedate_to_datetime
from importlib.resources import files
from typing import Any, Dict, List, Optional
from typing import Dict, List, Optional

import httpx
from pydantic import BaseModel
Expand All @@ -37,9 +39,8 @@ class GetBlogPosts:
"""
Fetches, validates, and caches LiteLLM blog posts.

Mirrors the structure of GetModelCostMap:
- Fetches from GitHub with a 5-second timeout
- Validates the response has a non-empty ``posts`` list
- Fetches RSS feed from docs site with a 5-second timeout
- Parses the XML and extracts the latest blog post
- Caches the result in-process for BLOG_POSTS_TTL_SECONDS (1 hour)
- Falls back to the bundled local backup on any failure
"""
Expand All @@ -56,30 +57,67 @@ def load_local_blog_posts() -> List[Dict[str, str]]:
return content.get("posts", [])

@staticmethod
def fetch_remote_blog_posts(url: str, timeout: int = 5) -> dict:
def fetch_rss_feed(url: str, timeout: int = 5) -> str:
    """
    Fetch RSS XML from a remote URL.

    Args:
        url: The RSS feed URL (configured via litellm.blog_posts_url /
            LITELLM_BLOG_POSTS_URL at the call site).
        timeout: Request timeout in seconds.

    Returns:
        The raw XML text of the response body.

    Raises:
        httpx.HTTPError: On network failure or a non-2xx status code.
    """
    response = httpx.get(url, timeout=timeout)
    # Surface non-2xx responses as exceptions so the caller can fall back
    # to the bundled local backup.
    response.raise_for_status()
    return response.text

@staticmethod
def validate_blog_posts(data: Any) -> bool:
"""Return True if data is a dict with a non-empty ``posts`` list."""
if not isinstance(data, dict):
verbose_logger.warning(
"LiteLLM: Blog posts response is not a dict (type=%s). "
"Falling back to local backup.",
type(data).__name__,
def parse_rss_to_posts(xml_text: str, max_posts: int = 1) -> List[Dict[str, str]]:
    """
    Parse RSS 2.0 XML and return a list of blog post dicts.

    Extracts title, description, date (YYYY-MM-DD), and url from each
    <item>, in feed order (newest first for this feed).

    Args:
        xml_text: Raw RSS XML document text.
        max_posts: Maximum number of items to return. Note the conservative
            default of 1 — callers that want to surface several posts must
            pass a larger value explicitly.

    Returns:
        List of dicts with keys ``title``, ``description``, ``date``, and
        ``url``. ``date`` keeps the raw pubDate text when it is not a
        parseable RFC 2822 timestamp.

    Raises:
        ValueError: If the payload is oversized or has no <channel> element.
        xml.etree.ElementTree.ParseError: If the XML is malformed.
    """
    # NOTE(review): the feed URL is operator-configurable, so this may parse
    # untrusted XML. Recent Expat versions mitigate classic entity-expansion
    # attacks, but rejecting oversized payloads before parsing is cheap
    # defense-in-depth; switch to defusedxml if it ever becomes an approved
    # dependency.
    max_bytes = 1_000_000
    if len(xml_text) > max_bytes:
        raise ValueError(f"RSS payload exceeds {max_bytes} bytes; refusing to parse")

    root = ET.fromstring(xml_text)
    channel = root.find("channel")
    if channel is None:
        raise ValueError("RSS feed missing <channel> element")

    posts: List[Dict[str, str]] = []
    for item in channel.findall("item"):
        if len(posts) >= max_posts:
            break

        title_el = item.find("title")
        link_el = item.find("link")
        if title_el is None or link_el is None:
            # Skip malformed items rather than failing the whole feed.
            continue

        desc_el = item.find("description")
        pub_date_el = item.find("pubDate")

        # Normalize the RFC 2822 pubDate to YYYY-MM-DD; keep the raw text
        # when it does not parse so callers still have something to show.
        date_str = ""
        if pub_date_el is not None and pub_date_el.text:
            try:
                date_str = parsedate_to_datetime(pub_date_el.text).strftime("%Y-%m-%d")
            except Exception:
                date_str = pub_date_el.text

        posts.append(
            {
                "title": title_el.text or "",
                "description": (desc_el.text or "") if desc_el is not None else "",
                "date": date_str,
                "url": link_el.text or "",
            }
        )

    return posts

@staticmethod
def validate_blog_posts(posts: List[Dict[str, str]]) -> bool:
    """
    Return True if ``posts`` is a non-empty list.

    Logs a warning and returns False otherwise, so the caller knows to
    fall back to the bundled local backup.
    """
    if not isinstance(posts, list) or len(posts) == 0:
        verbose_logger.warning(
            "LiteLLM: Parsed RSS feed has no valid posts. "
            "Falling back to local backup.",
        )
        return False
    # NOTE(review): the success-path `return True` sits in a collapsed diff
    # hunk; reconstructed from the caller's boolean usage — confirm on merge.
    return True
Expand All @@ -102,7 +140,8 @@ def get_blog_posts(cls, url: str) -> List[Dict[str, str]]:
return cached

try:
data = cls.fetch_remote_blog_posts(url)
xml_text = cls.fetch_rss_feed(url)
posts = cls.parse_rss_to_posts(xml_text)
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

max_posts=1 silently limits output to a single post

get_blog_posts calls parse_rss_to_posts(xml_text) without overriding max_posts, so the RSS feed is always truncated to one post. If the previous blog_posts.json contained multiple posts and the UI surfaces more than one, callers will now always receive a single-item list — a silent regression in the number of blog posts shown.

The default of 1 in parse_rss_to_posts is reasonable as a conservative safeguard, but the call site should explicitly set the intended limit (or use a named constant) so the behaviour is obvious and easy to change:

Suggested change
posts = cls.parse_rss_to_posts(xml_text)
posts = cls.parse_rss_to_posts(xml_text, max_posts=10)

(Adjust the value to whatever the UI is designed to display.)

except Exception as e:
verbose_logger.warning(
"LiteLLM: Failed to fetch blog posts from %s: %s. "
Expand All @@ -112,10 +151,9 @@ def get_blog_posts(cls, url: str) -> List[Dict[str, str]]:
)
return cls.load_local_blog_posts()

if not cls.validate_blog_posts(data):
if not cls.validate_blog_posts(posts):
return cls.load_local_blog_posts()

posts = data["posts"]
cls._cached_posts = posts
cls._last_fetch_time = now
return posts
Expand Down
87 changes: 60 additions & 27 deletions tests/test_litellm/test_get_blog_posts.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
"""Tests for GetBlogPosts utility class."""
import json
import time
from unittest.mock import MagicMock, patch

Expand All @@ -13,16 +12,26 @@
get_blog_posts,
)

SAMPLE_RESPONSE = {
"posts": [
{
"title": "Test Post",
"description": "A test post.",
"date": "2026-01-01",
"url": "https://www.litellm.ai/blog/test",
}
]
}
# Two-item RSS 2.0 fixture mirroring the docs.litellm.ai feed shape:
# newest item first, RFC 2822 pubDate values.
SAMPLE_RSS = """\
<?xml version="1.0" encoding="UTF-8"?>
<rss version="2.0">
<channel>
<title>LiteLLM Blog</title>
<item>
<title>Test Post</title>
<link>https://docs.litellm.ai/blog/test</link>
<description>A test post.</description>
<pubDate>Wed, 01 Jan 2026 10:00:00 GMT</pubDate>
</item>
<item>
<title>Second Post</title>
<link>https://docs.litellm.ai/blog/second</link>
<description>Another post.</description>
<pubDate>Tue, 31 Dec 2025 10:00:00 GMT</pubDate>
</item>
</channel>
</rss>
"""


@pytest.fixture(autouse=True)
Expand All @@ -45,26 +54,48 @@ def test_load_local_blog_posts_returns_list():
assert "url" in first


def test_validate_blog_posts_valid():
assert GetBlogPosts.validate_blog_posts(SAMPLE_RESPONSE) is True
def test_parse_rss_to_posts():
    """max_posts=1 keeps only the newest item, with every field mapped."""
    posts = GetBlogPosts.parse_rss_to_posts(SAMPLE_RSS, max_posts=1)
    assert len(posts) == 1
    assert posts[0] == {
        "title": "Test Post",
        "description": "A test post.",
        "date": "2026-01-01",
        "url": "https://docs.litellm.ai/blog/test",
    }


def test_parse_rss_to_posts_multiple():
    """A max_posts larger than the feed returns every item, in feed order."""
    posts = GetBlogPosts.parse_rss_to_posts(SAMPLE_RSS, max_posts=5)
    assert [post["title"] for post in posts] == ["Test Post", "Second Post"]


def test_parse_rss_to_posts_invalid_xml():
    """Malformed XML propagates a parse error to the caller."""
    with pytest.raises(Exception):
        GetBlogPosts.parse_rss_to_posts("not xml")


def test_parse_rss_to_posts_missing_channel():
    """An <rss> document without <channel> raises a descriptive ValueError."""
    with pytest.raises(ValueError, match="missing <channel>"):
        GetBlogPosts.parse_rss_to_posts("<rss></rss>")


def test_validate_blog_posts_valid():
    """A non-empty list of post dicts validates."""
    posts = [{"title": "T", "description": "D", "date": "2026-01-01", "url": "https://x.com"}]
    assert GetBlogPosts.validate_blog_posts(posts) is True


def test_validate_blog_posts_empty_list():
    """An empty list fails validation."""
    assert GetBlogPosts.validate_blog_posts([]) is False


def test_validate_blog_posts_not_list():
    """A non-list value fails validation."""
    assert GetBlogPosts.validate_blog_posts("not a list") is False


def test_get_blog_posts_success():
"""Fetches from remote on first call."""
"""Fetches from RSS on first call."""
mock_response = MagicMock()
mock_response.json.return_value = SAMPLE_RESPONSE
mock_response.text = SAMPLE_RSS
mock_response.raise_for_status = MagicMock()

with patch("litellm.litellm_core_utils.get_blog_posts.httpx.get", return_value=mock_response):
Expand All @@ -86,10 +117,10 @@ def test_get_blog_posts_network_error_falls_back_to_local():
assert len(posts) > 0


def test_get_blog_posts_invalid_json_falls_back_to_local():
"""Falls back when remote returns non-dict."""
def test_get_blog_posts_invalid_xml_falls_back_to_local():
"""Falls back when remote returns invalid XML."""
mock_response = MagicMock()
mock_response.json.return_value = "not a dict"
mock_response.text = "not valid xml"
mock_response.raise_for_status = MagicMock()

with patch("litellm.litellm_core_utils.get_blog_posts.httpx.get", return_value=mock_response):
Expand All @@ -101,7 +132,8 @@ def test_get_blog_posts_invalid_json_falls_back_to_local():

def test_get_blog_posts_ttl_cache_not_refetched():
"""Within TTL window, does not re-fetch."""
GetBlogPosts._cached_posts = SAMPLE_RESPONSE["posts"]
cached = [{"title": "Cached", "description": "D", "date": "2026-01-01", "url": "https://x.com"}]
GetBlogPosts._cached_posts = cached
GetBlogPosts._last_fetch_time = time.time() # just now

call_count = 0
Expand All @@ -110,7 +142,7 @@ def mock_get(*args, **kwargs):
nonlocal call_count
call_count += 1
m = MagicMock()
m.json.return_value = SAMPLE_RESPONSE
m.text = SAMPLE_RSS
m.raise_for_status = MagicMock()
return m

Expand All @@ -123,11 +155,12 @@ def mock_get(*args, **kwargs):

def test_get_blog_posts_ttl_expired_refetches():
"""After TTL window, re-fetches from remote."""
GetBlogPosts._cached_posts = SAMPLE_RESPONSE["posts"]
cached = [{"title": "Cached", "description": "D", "date": "2026-01-01", "url": "https://x.com"}]
GetBlogPosts._cached_posts = cached
GetBlogPosts._last_fetch_time = time.time() - 7200 # 2 hours ago

mock_response = MagicMock()
mock_response.json.return_value = SAMPLE_RESPONSE
mock_response.text = SAMPLE_RSS
mock_response.raise_for_status = MagicMock()

with patch(
Expand Down
Loading