Skip to content

Commit

Permalink
Merge pull request #6937 from readthedocs/mkdocs-search
Browse files Browse the repository at this point in the history
Add support for Mkdocs search
  • Loading branch information
stsewd authored Apr 29, 2020
2 parents 69f3eba + da184f5 commit 9802ad0
Show file tree
Hide file tree
Showing 17 changed files with 330 additions and 30 deletions.
6 changes: 6 additions & 0 deletions readthedocs/builds/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,8 @@
MEDIA_TYPES,
PRIVACY_CHOICES,
SPHINX,
SPHINX_HTMLDIR,
SPHINX_SINGLEHTML,
)
from readthedocs.projects.models import APIProject, Project
from readthedocs.projects.version_handling import determine_stable_version
Expand Down Expand Up @@ -368,6 +370,10 @@ def supports_wipe(self):
"""Return True if version is not external."""
return not self.type == EXTERNAL

@property
def is_sphinx_type(self):
    """Return True if the documentation type is one of the Sphinx types."""
    sphinx_doctypes = {SPHINX, SPHINX_HTMLDIR, SPHINX_SINGLEHTML}
    return self.documentation_type in sphinx_doctypes

def get_subdomain_url(self):
external = self.type == EXTERNAL
return self.project.get_docs_url(
Expand Down
9 changes: 2 additions & 7 deletions readthedocs/doc_builder/backends/mkdocs.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,10 @@
import yaml
from django.conf import settings
from django.template import loader as template_loader
from readthedocs.projects.constants import MKDOCS_HTML, MKDOCS

from readthedocs.doc_builder.base import BaseBuilder
from readthedocs.doc_builder.exceptions import MkDocsYAMLParseError
from readthedocs.projects.constants import MKDOCS, MKDOCS_HTML
from readthedocs.projects.models import Feature


Expand Down Expand Up @@ -314,17 +314,12 @@ def get_theme_name(self, mkdocs_config):


class MkdocsHTML(BaseMkdocs):

type = 'mkdocs'
builder = 'build'
build_dir = '_build/html'


class MkdocsJSON(BaseMkdocs):
type = 'mkdocs_json'
builder = 'json'
build_dir = '_build/json'


class SafeLoaderIgnoreUnknown(yaml.SafeLoader): # pylint: disable=too-many-ancestors

"""
Expand Down
3 changes: 0 additions & 3 deletions readthedocs/doc_builder/loader.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
# -*- coding: utf-8 -*-

"""Lookup tables for builders and backends."""
from importlib import import_module

Expand All @@ -21,7 +19,6 @@
'sphinx_singlehtmllocalmedia': sphinx.LocalMediaBuilder,
# Other markup
'mkdocs': mkdocs.MkdocsHTML,
'mkdocs_json': mkdocs.MkdocsJSON,
}


Expand Down
50 changes: 48 additions & 2 deletions readthedocs/projects/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@
validate_repository_url,
)
from readthedocs.projects.version_handling import determine_stable_version
from readthedocs.search.parse_json import process_file
from readthedocs.search.parse_json import process_file, process_mkdocs_index_file
from readthedocs.vcs_support.backends import backend_cls
from readthedocs.vcs_support.utils import Lock, NonBlockingLock

Expand Down Expand Up @@ -1330,7 +1330,7 @@ class Meta:

objects = HTMLFileManager.from_queryset(HTMLFileQuerySet)()

def get_processed_json(self):
def get_processed_json_sphinx(self):
"""
Get the parsed JSON for search indexing.
Expand Down Expand Up @@ -1374,6 +1374,52 @@ def get_processed_json(self):
'domain_data': {},
}

def get_processed_json_mkdocs(self):
    """
    Get the parsed JSON for search indexing from the MkDocs search index.

    Looks for ``search/search_index.json`` under the ``html`` storage path
    of this file's version and extracts the data for ``self.path``
    using ``process_mkdocs_index_file``.

    This is best effort: if the index doesn't exist, yields no data for
    this page, or processing raises, a dictionary with ``path`` set and an
    empty ``title``, ``sections`` and ``domain_data`` is returned instead.
    """
    log.debug('Processing mkdocs index')
    storage = get_storage_class(settings.RTD_BUILD_MEDIA_STORAGE)()
    storage_path = self.project.get_storage_path(
        type_='html', version_slug=self.version.slug, include_file=False
    )
    # Bound before the ``try`` so the ``except`` handler can always log it,
    # even when ``storage.join`` itself is the call that failed.
    file_path = None
    try:
        file_path = storage.join(storage_path, 'search/search_index.json')
        if storage.exists(file_path):
            index_data = process_mkdocs_index_file(file_path, page=self.path)
            if index_data:
                return index_data
    except Exception:
        # Deliberately broad: a broken index must not abort indexing,
        # but keep the traceback so the failure can be diagnosed.
        log.warning(
            'Unhandled exception during search processing file: %s',
            file_path,
            exc_info=True,
        )
    return {
        'path': self.path,
        'title': '',
        'sections': [],
        'domain_data': {},
    }

def get_processed_json(self):
    """
    Return the data used to index this file for search.

    Versions with a Sphinx documentation type are handled by
    ``get_processed_json_sphinx``, any other type by
    ``get_processed_json_mkdocs``.

    The result is a dictionary with the following structure.

    {
        'path': 'file path',
        'title': 'Title',
        'sections': [
            {
                'id': 'section-anchor',
                'title': 'Section title',
                'content': 'Section content',
            },
        ],
        'domain_data': {},
    }
    """
    processor = (
        self.get_processed_json_sphinx
        if self.version.is_sphinx_type
        else self.get_processed_json_mkdocs
    )
    return processor()

@cached_property
def processed_json(self):
return self.get_processed_json()
Expand Down
17 changes: 11 additions & 6 deletions readthedocs/projects/tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -1234,12 +1234,14 @@ def get_final_doctype(self):
return html_builder.get_final_doctype()

def build_docs_search(self):
"""Build search data."""
# Search is always run in sphinx using the rtd-sphinx-extension.
# Mkdocs has no search currently.
if self.is_type_sphinx() and self.version.type != EXTERNAL:
return True
return False
"""
Build search data.
.. note::
For MkDocs search is indexed from its ``html`` artifacts.
And in sphinx is run using the rtd-sphinx-extension.
"""
return self.is_type_sphinx() and self.version.type != EXTERNAL

def build_docs_localmedia(self):
"""Get local media files with separate build."""
Expand Down Expand Up @@ -1593,6 +1595,9 @@ def _create_intersphinx_data(version, commit, build):
:param commit: Commit that updated path
:param build: Build id
"""
if not version.is_sphinx_type:
return

storage = get_storage_class(settings.RTD_BUILD_MEDIA_STORAGE)()

html_storage_path = version.project.get_storage_path(
Expand Down
11 changes: 5 additions & 6 deletions readthedocs/search/documents.py
Original file line number Diff line number Diff line change
Expand Up @@ -109,8 +109,10 @@ class Meta:

def prepare_domains(self, html_file):
"""Prepares and returns the values for domains field."""
all_domains = []
if not html_file.version.is_sphinx_type:
return []

all_domains = []
try:
domains_qs = html_file.sphinx_domains.exclude(
domain='std',
Expand Down Expand Up @@ -172,11 +174,8 @@ def get_queryset(self):
"""Overwrite default queryset to filter certain files to index."""
queryset = super().get_queryset()

# Do not index files that belong to non sphinx project
# Also do not index certain files
queryset = queryset.internal().filter(
project__documentation_type__contains='sphinx'
)
# Do not index files from external versions
queryset = queryset.internal().all()

# TODO: Make this smarter
# This was causing issues excluding some valid user documentation pages
Expand Down
54 changes: 53 additions & 1 deletion readthedocs/search/parse_json.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
"""Functions related to converting content into dict/JSON structures."""

import logging
from urllib.parse import urlparse
import orjson as json

from django.conf import settings
Expand Down Expand Up @@ -195,5 +196,56 @@ def parse_content(content, remove_first_line=False):
content = content[1:]

# converting newlines to ". "
content = ' '.join([text.strip() for text in content if text])
content = ' '.join(text.strip() for text in content if text)
return content


def process_mkdocs_index_file(json_storage_path, page):
    """
    Read the MkDocs search index and extract the data of a single page.

    Each entry under ``docs`` in the index has a ``location``: entries
    without a fragment describe the page itself, entries with a fragment
    describe one section of the page.

    :param json_storage_path: path of the index file in the build media storage
    :param page: path of the HTML file to extract, like ``versions/index.html``
    :returns: an empty dict if the index has no entries for ``page``,
        otherwise a dict with the keys ``path``, ``title``,
        ``sections`` and ``domain_data``.
    :raises IOError: if the index file can't be read from storage
    """
    log.debug('Processing JSON index file: %s', json_storage_path)

    storage = get_storage_class(settings.RTD_BUILD_MEDIA_STORAGE)()
    try:
        with storage.open(json_storage_path, mode='r') as f:
            file_contents = f.read()
    except IOError:
        log.info('Unable to read file: %s', json_storage_path)
        raise

    data = json.loads(file_contents)
    page_data = {}

    for section in data.get('docs', []):
        parsed_path = urlparse(section.get('location', ''))
        fragment = parsed_path.fragment
        path = parsed_path.path

        # Some old versions of mkdocs
        # index the pages as ``/page.html`` instead of ``page.html``.
        path = path.lstrip('/')

        # Directory style locations point to the index file of the directory.
        if path == '' or path.endswith('/'):
            path += 'index.html'

        if page != path:
            continue

        title = HTMLParser(section.get('title')).text()

        if not fragment:
            page_data.update({
                'path': path,
                'title': title,
                'domain_data': {},
            })
        else:
            # The text is only used for sections,
            # so it's only parsed for entries with a fragment.
            content = parse_content(
                HTMLParser(section.get('text')).text()
            )
            page_data.setdefault('sections', []).append({
                'id': fragment,
                'title': title,
                'content': content,
            })

    # A page may have been indexed without sections, or only through its
    # sections. Fill the missing keys so a non-empty result always has the
    # full structure; an empty dict still means "page not found".
    if page_data:
        page_data.setdefault('path', page)
        page_data.setdefault('title', '')
        page_data.setdefault('sections', [])
        page_data.setdefault('domain_data', {})

    return page_data
6 changes: 3 additions & 3 deletions readthedocs/search/tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
from .dummy_data import ALL_PROJECTS, PROJECT_DATA_FILES


@pytest.fixture()
@pytest.fixture
def es_index():
call_command('search_index', '--delete', '-f')
call_command('search_index', '--create')
Expand All @@ -23,7 +23,7 @@ def es_index():
call_command('search_index', '--delete', '-f')


@pytest.fixture(autouse=True)
@pytest.fixture
def all_projects(es_index, mock_processed_json, db, settings):
settings.ELASTICSEARCH_DSL_AUTOSYNC = True
projects_list = []
Expand Down Expand Up @@ -95,7 +95,7 @@ def get_dummy_processed_json(instance):
return json.load(f)


@pytest.fixture(autouse=True)
@pytest.fixture
def mock_processed_json(mocker):
mocked_function = mocker.patch.object(HTMLFile, 'get_processed_json', autospec=True)
mocked_function.side_effect = get_dummy_processed_json
31 changes: 31 additions & 0 deletions readthedocs/search/tests/data/mkdocs/in/search_index.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
{
"config": {
"lang": [
"en"
],
"prebuild_index": false,
"separator": "[\\s\\-]+"
},
"docs": [
{
"location": "",
"text": "Read the Docs MkDocs Test Project This is a test of MkDocs as it appears on Read the Docs.",
"title": "Read the Docs MkDocs Test Project"
},
{
"location": "#read-the-docs-mkdocs-test-project",
"text": "Read the Docs MkDocs Test Project This is a test of MkDocs as it appears on Read the Docs.",
"title": "Read the Docs MkDocs Test Project"
},
{
"location": "versions/",
"text": "Versions & Themes There are a number of versions and themes for mkdocs.",
"title": "Versions & Themes"
},
{
"location": "versions/#versions-themes",
"text": "Versions & Themes There are a number of versions and themes for mkdocs.",
"title": "Versions & Themes"
}
]
}
24 changes: 24 additions & 0 deletions readthedocs/search/tests/data/mkdocs/in/search_index_old.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
{
"docs": [
{
"location": "/",
"text": "Read the Docs MkDocs Test Project\n\n\nThis is a test of \nMkDocs\n as it appears on \nRead the Docs\n.",
"title": "Read the Docs MkDocs Test Project"
},
{
"location": "/#read-the-docs-mkdocs-test-project",
"text": "Read the Docs MkDocs Test Project\n\n\nThis is a test of \nMkDocs\n as it appears on \nRead the Docs\n.",
"title": "Read the Docs MkDocs Test Project"
},
{
"location": "/versions/",
"text": "Versions & Themes\n\n\nThere are a number of versions and themes for mkdocs.",
"title": "Versions & Themes"
},
{
"location": "/versions/#versions-themes",
"text": "Versions & Themes\n\n\nThere are a number of versions and themes for mkdocs.",
"title": "Versions & Themes"
}
]
}
26 changes: 26 additions & 0 deletions readthedocs/search/tests/data/mkdocs/out/search_index.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
[
{
"title": "Read the Docs MkDocs Test Project",
"path": "index.html",
"sections": [
{
"id": "read-the-docs-mkdocs-test-project",
"title": "Read the Docs MkDocs Test Project",
"content": "Read the Docs MkDocs Test Project This is a test of MkDocs as it appears on Read the Docs."
}
],
"domain_data": {}
},
{
"title": "Versions & Themes",
"path": "versions/index.html",
"sections": [
{
"id": "versions-themes",
"title": "Versions & Themes",
"content": "Versions & Themes There are a number of versions and themes for mkdocs."
}
],
"domain_data": {}
}
]
26 changes: 26 additions & 0 deletions readthedocs/search/tests/data/mkdocs/out/search_index_old.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
[
{
"title": "Read the Docs MkDocs Test Project",
"path": "index.html",
"sections": [
{
"id": "read-the-docs-mkdocs-test-project",
"title": "Read the Docs MkDocs Test Project",
"content": "Read the Docs MkDocs Test Project This is a test of MkDocs as it appears on Read the Docs ."
}
],
"domain_data": {}
},
{
"title": "Versions & Themes",
"path": "versions/index.html",
"sections": [
{
"id": "versions-themes",
"title": "Versions & Themes",
"content": "Versions & Themes There are a number of versions and themes for mkdocs."
}
],
"domain_data": {}
}
]
Loading

0 comments on commit 9802ad0

Please sign in to comment.