Skip to content

Commit

Permalink
Feat: Add Indexing for quicker search
Browse files Browse the repository at this point in the history
fixes: #13
  • Loading branch information
reglim committed Oct 31, 2022
1 parent 41f3887 commit ada0d9a
Show file tree
Hide file tree
Showing 12 changed files with 985 additions and 143 deletions.
12 changes: 7 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,28 +11,30 @@ The simplest way is to build and run the docker container,
you can optionally use volumes to persist state:

```sh
# run container in background and persist data (docs, nginx configs and tokens database)
# run container in background and persist data (docs, nginx configs and tokens database as well as the content index)
# use 'ghcr.io/docat-org/docat:unstable' to get the latest changes
mkdir -p docat-run/db && touch docat-run/db/db.json
mkdir -p docat-run/db && touch docat-run/db/db.json && touch docat-run/db/index.json
docker run \
--detach \
--volume $PWD/docat-run/doc:/var/docat/doc/ \
--volume $PWD/docat-run/db/db.json:/app/docat/db.json \
--volume $PWD/docat-run/db/index.json:/app/docat/index.json \
--publish 8000:80 \
ghcr.io/docat-org/docat
```

*Alternative:* Mount a dedicated directory to host `db.json` :
*Alternative:* Mount a dedicated directory to host `db.json` and `index.json`:

```sh
# run container in background and persist data (docs, nginx configs and tokens database)
# run container in background and persist data (docs, nginx configs and tokens database as well as the content index)
# use 'ghcr.io/docat-org/docat:unstable' to get the latest changes
mkdir -p docat-run/db && touch docat-run/db/db.json
mkdir -p docat-run/db && touch docat-run/db/db.json && touch docat-run/db/index.json
docker run \
--detach \
--volume $PWD/docat-run/doc:/var/docat/doc/ \
--volume $PWD/docat-run/db:/var/docat/db/ \
--env DOCAT_DB_PATH=/var/docat/db/db.json \
--env DOCAT_INDEX_PATH=/var/docat/db/index.json \
--publish 8000:80 \
ghcr.io/docat-org/docat
```
Expand Down
1 change: 1 addition & 0 deletions docat/.gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -5,4 +5,5 @@ upload
.tox
.coverage
db.json
index.json
.python-version
235 changes: 103 additions & 132 deletions docat/docat/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,20 +10,42 @@
import os
import secrets
import shutil
from dataclasses import dataclass
from pathlib import Path
from typing import Optional, Tuple
from typing import List, Optional, Tuple

import magic
from bs4 import BeautifulSoup
from bs4.element import Comment
from fastapi import Depends, FastAPI, File, Header, Response, UploadFile, status
from fastapi.staticfiles import StaticFiles
from pydantic import BaseModel
from starlette.responses import JSONResponse
from tinydb import Query, TinyDB

from docat.utils import DB_PATH, UPLOAD_FOLDER, calculate_token, create_symlink, extract_archive, remove_docs
from docat.models import (
ApiResponse,
ClaimResponse,
ProjectDetailResponse,
ProjectsResponse,
SearchResponse,
SearchResultFile,
SearchResultProject,
SearchResultVersion,
TokenStatus,
)
from docat.utils import (
DB_PATH,
INDEX_PATH,
UPLOAD_FOLDER,
calculate_token,
create_symlink,
extract_archive,
get_all_projects,
get_project_details,
index_all_projects,
remove_docs,
remove_file_index_from_db,
remove_version_from_version_index,
update_file_index_for_project_version,
update_version_index_for_project,
)

#: Holds the FastAPI application
app = FastAPI(
Expand All @@ -36,6 +58,8 @@
#: Holds an instance to the TinyDB
DOCAT_DB_PATH = os.getenv("DOCAT_DB_PATH", DB_PATH)
db = TinyDB(DOCAT_DB_PATH)
#: Holds the path to the TinyDB file used for indexing the uploaded content
DOCAT_INDEX_PATH = os.getenv("DOCAT_INDEX_PATH", INDEX_PATH)
#: Holds the static base path where the uploaded documentation artifacts are stored
DOCAT_UPLOAD_FOLDER = Path(os.getenv("DOCAT_DOC_PATH", UPLOAD_FOLDER))

Expand All @@ -45,74 +69,18 @@ def get_db():
return db


@dataclass(frozen=True)
class TokenStatus:
valid: bool
reason: Optional[str] = None


class ApiResponse(BaseModel):
message: str


class ClaimResponse(ApiResponse):
token: str


class ProjectsResponse(BaseModel):
projects: list[str]


class ProjectVersion(BaseModel):
name: str
tags: list[str]


class ProjectDetailResponse(BaseModel):
name: str
versions: list[ProjectVersion]


class SearchResultProject(BaseModel):
name: str


class SearchResultVersion(BaseModel):
project: str
version: str


class SearchResultFile(BaseModel):
project: str
version: str
path: str


class SearchResponse(BaseModel):
projects: list[SearchResultProject]
versions: list[SearchResultVersion]
files: list[SearchResultFile]
@app.post("/api/index/update", response_model=ApiResponse, status_code=status.HTTP_200_OK)
@app.post("/api/index/update/", response_model=ApiResponse, status_code=status.HTTP_200_OK)
def update_index():
index_all_projects(DOCAT_UPLOAD_FOLDER, DOCAT_INDEX_PATH)
return ApiResponse(message="Successfully updated search index")


@app.get("/api/projects", response_model=ProjectsResponse, status_code=status.HTTP_200_OK)
def get_projects():
if not DOCAT_UPLOAD_FOLDER.exists():
return ProjectsResponse(projects=[])

def has_not_hidden_versions(project):
path = DOCAT_UPLOAD_FOLDER / project
return any(
(path / version).is_dir() and not (path / version / ".hidden").exists() for version in (DOCAT_UPLOAD_FOLDER / project).iterdir()
)

return ProjectsResponse(
projects=list(
filter(
has_not_hidden_versions,
[str(project.relative_to(DOCAT_UPLOAD_FOLDER)) for project in DOCAT_UPLOAD_FOLDER.iterdir() if project.is_dir()],
)
)
)
return get_all_projects(DOCAT_UPLOAD_FOLDER)


@app.get(
Expand All @@ -128,97 +96,84 @@ def has_not_hidden_versions(project):
responses={status.HTTP_404_NOT_FOUND: {"model": ApiResponse}},
)
def get_project(project):
docs_folder = DOCAT_UPLOAD_FOLDER / project
if not docs_folder.exists():
details = get_project_details(DOCAT_UPLOAD_FOLDER, project)

if not details:
return JSONResponse(status_code=status.HTTP_404_NOT_FOUND, content={"message": f"Project {project} does not exist"})

tags = [x for x in docs_folder.iterdir() if x.is_dir() and x.is_symlink()]

return ProjectDetailResponse(
name=project,
versions=sorted(
[
ProjectVersion(
name=str(x.relative_to(docs_folder)),
tags=[str(t.relative_to(docs_folder)) for t in tags if t.resolve() == x],
)
for x in docs_folder.iterdir()
if x.is_dir() and not x.is_symlink() and not (docs_folder / x.name / ".hidden").exists()
],
key=lambda k: k.name,
reverse=True,
),
)
return details


@app.get("/api/search", response_model=SearchResponse, status_code=status.HTTP_200_OK)
@app.get("/api/search/", response_model=SearchResponse, status_code=status.HTTP_200_OK)
def search(query: str):
query = query.lower()
found_projects: list[SearchResultProject] = list()
found_versions: list[SearchResultVersion] = list()
found_files: list[SearchResultFile] = list()
found_projects: List[SearchResultProject] = list()
found_versions: List[SearchResultVersion] = list()
found_files: List[SearchResultFile] = list()

all_projects = get_projects().projects
all_versions: list[Tuple[str, ProjectVersion]] = list()
index_db = TinyDB(DOCAT_INDEX_PATH)
project_table = index_db.table("projects")
projects = project_table.all()
all_versions: List[Tuple] = list()

# Collect all projects that contain the query
for project in all_projects:
project_details = get_project(project)
for project in projects:
name = project.get("name")
versions = project.get("versions")

if not name or not versions:
continue

all_versions += ((project, v) for v in project_details.versions)
all_versions += ((name, version) for version in versions)

if query in project:
project_res = SearchResultProject(name=project)
if query in name.lower():
project_res = SearchResultProject(name=name)
found_projects.append(project_res)

# Order by occurrences of the query
found_projects = sorted(found_projects, key=lambda x: x.name.count(query), reverse=True)

# Collect all versions and tags that contain the query
for (project, version) in all_versions:
if query in version.name:
version_res = SearchResultVersion(version=version.name, project=project)
version_name = version.get("name")
version_tags = version.get("tags")

if query in version_name.lower():
version_res = SearchResultVersion(project=project, version=version_name)
found_versions.append(version_res)

for tag in version.tags:
for tag in version_tags:
if query in tag:
tag_res = SearchResultVersion(version=tag, project=project)
found_versions.append(tag_res)

# Collect all files whose name contains the query or whose content contains the query
for (project, version) in all_versions:
docs_folder = DOCAT_UPLOAD_FOLDER / project / version.name
for file in docs_folder.rglob("*"):
if not file.is_file():
continue
# Order by occurrences of the query
found_versions = sorted(found_versions, key=lambda x: x.version.count(query), reverse=True)

if query in file.name:
file_res = SearchResultFile(path=str(file.relative_to(docs_folder)), version=version.name, project=project)
found_files.append(file_res)
continue

# don't check contents of non-html files
if not file.name.endswith(".html"):
continue
# Collect all files whose name contains the query or whose content contains the query
files_table = index_db.table("files")
files = files_table.all()

def html_tag_visible(element):
if element.parent.name in ["style", "script", "head", "title", "meta", "[document]"]:
return False
for file in files:
file_content = file.get("content")
file_path_str = file.get("path")
file_project = file.get("project")
file_project_version = file.get("version")

if isinstance(element, Comment):
return False
if file_content is None or not file_path_str or not file_project or not file_project_version:
continue

return True
file_path = Path(file_path_str)

with open(file, "r") as f:
file_content = f.read()
soup = BeautifulSoup(file_content, "html.parser")
text_content = soup.findAll(text=True)
visible_text_content = filter(html_tag_visible, text_content)
all_text = " ".join(t.strip() for t in visible_text_content).lower()
if query in file_path.name.lower():
file_res = SearchResultFile(project=file_project, version=file_project_version, path=file_path_str)
found_files.append(file_res)
continue # Skip content search if the file name already matches

if query in all_text:
file_res = SearchResultFile(path=str(file.relative_to(docs_folder)), version=version.name, project=project)
found_files.append(file_res)
continue
if file_path.suffix == ".html" and query in file_content.lower():
file_res = SearchResultFile(project=file_project, version=file_project_version, path=file_path_str)
found_files.append(file_res)

return SearchResponse(projects=found_projects, versions=found_versions, files=found_files)

Expand Down Expand Up @@ -371,6 +326,8 @@ def upload(
shutil.copyfileobj(file.file, buffer)

extract_archive(target_file, base_path)
update_version_index_for_project(DOCAT_UPLOAD_FOLDER, DOCAT_INDEX_PATH, project)
update_file_index_for_project_version(DOCAT_UPLOAD_FOLDER, DOCAT_INDEX_PATH, project, version)
return ApiResponse(message="File successfully uploaded")


Expand All @@ -385,6 +342,7 @@ def tag(project: str, version: str, new_tag: str, response: Response):
return ApiResponse(message=f"Version {version} not found")

if create_symlink(version, destination):
update_version_index_for_project(DOCAT_UPLOAD_FOLDER, DOCAT_INDEX_PATH, project)
return ApiResponse(message=f"Tag {new_tag} -> {version} successfully created")
else:
response.status_code = status.HTTP_409_CONFLICT
Expand Down Expand Up @@ -439,8 +397,19 @@ def rename(project: str, new_project_name: str, response: Response, docat_api_ke

# update the claim to the new project name
Project = Query()
table = db.table("claims")
table.update({"name": new_project_name}, Project.name == project)
claims_table = db.table("claims")
claims_table.update({"name": new_project_name}, Project.name == project)

# update the version index to the new project name
index_db = TinyDB(DOCAT_INDEX_PATH)
Project = Query()
project_table = index_db.table("projects")
project_table.update({"name": new_project_name}, Project.name == project)

# update the file index to the new project name
File = Query()
file_table = index_db.table("files")
file_table.update({"project": new_project_name}, File.project == project)

os.rename(project_base_path, new_project_base_path)

Expand All @@ -458,6 +427,8 @@ def delete(project: str, version: str, response: Response, docat_api_key: str =
response.status_code = status.HTTP_404_NOT_FOUND
return ApiResponse(message=message)
else:
remove_version_from_version_index(DOCAT_INDEX_PATH, project, version)
remove_file_index_from_db(DOCAT_INDEX_PATH, project, version)
return ApiResponse(message=f"Successfully deleted version '{version}'")
else:
response.status_code = status.HTTP_401_UNAUTHORIZED
Expand Down
Loading

0 comments on commit ada0d9a

Please sign in to comment.