From f3eaf18f520a2188b55a2358ff7b0b22945cab62 Mon Sep 17 00:00:00 2001
From: Jiaxiang Yu <yujiaxiang2013@gmail.com>
Date: Sun, 3 Sep 2023 23:20:13 +0800
Subject: [PATCH 01/12] Add export of documentation for core functions under
 LLMEngine and AsyncLLMEngine

---
 docs/source/conf.py                  | 10 ++++++----
 docs/source/dev/async_llm_engine.rst |  7 +++++++
 docs/source/dev/engine.rst           | 13 +++++++++++++
 docs/source/dev/llm_engine.rst       |  6 ++++++
 docs/source/index.rst                | 13 +++++++++++++
 5 files changed, 45 insertions(+), 4 deletions(-)
 create mode 100644 docs/source/dev/async_llm_engine.rst
 create mode 100644 docs/source/dev/engine.rst
 create mode 100644 docs/source/dev/llm_engine.rst

diff --git a/docs/source/conf.py b/docs/source/conf.py
index d0c64cf53230..1ca0a2cb1118 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -9,10 +9,10 @@
 # If extensions (or modules to document with autodoc) are in another directory,
 # add these directories to sys.path here. If the directory is relative to the
 # documentation root, use os.path.abspath to make it absolute, like shown here.
-#
-# import os
-# import sys
-# sys.path.insert(0, os.path.abspath('.'))
+
+import os
+import sys
+sys.path.insert(0, os.path.abspath(os.path.join('..', '..')))
 
 
 # -- Project information -----------------------------------------------------
@@ -32,6 +32,8 @@
     "sphinx.ext.viewcode",
     "sphinx.ext.intersphinx",
     "sphinx_copybutton",
+    "sphinx.ext.autodoc",
+    "sphinx.ext.autosummary",
 ]
 
 # Add any paths that contain templates here, relative to this directory.
diff --git a/docs/source/dev/async_llm_engine.rst b/docs/source/dev/async_llm_engine.rst
new file mode 100644
index 000000000000..47db1e0a401b
--- /dev/null
+++ b/docs/source/dev/async_llm_engine.rst
@@ -0,0 +1,7 @@
+
+AsyncLLMEngine
+=================================
+
+.. autoclass:: vllm.engine.async_llm_engine.AsyncLLMEngine
+    :members: generate, abort
+    :show-inheritance:
diff --git a/docs/source/dev/engine.rst b/docs/source/dev/engine.rst
new file mode 100644
index 000000000000..75067486bd71
--- /dev/null
+++ b/docs/source/dev/engine.rst
@@ -0,0 +1,13 @@
+VLLM Engine
+=================================
+
+.. automodule:: vllm.engine
+.. currentmodule:: vllm.engine
+
+.. toctree::
+   :maxdepth: 2
+   :caption: Engines
+   
+   llm_engine
+   async_llm_engine
+
diff --git a/docs/source/dev/llm_engine.rst b/docs/source/dev/llm_engine.rst
new file mode 100644
index 000000000000..1de6d7adc87c
--- /dev/null
+++ b/docs/source/dev/llm_engine.rst
@@ -0,0 +1,6 @@
+LLMEngine
+=================================
+
+.. autoclass:: vllm.engine.llm_engine.LLMEngine
+    :members: add_request, abort_request, step
+    :show-inheritance:
\ No newline at end of file
diff --git a/docs/source/index.rst b/docs/source/index.rst
index f2131cd88f41..83543654443f 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -72,3 +72,16 @@ Documentation
 
    models/supported_models
    models/adding_model
+
+.. toctree::
+   :maxdepth: 2
+   :caption: Developer doc
+
+   dev/engine
+
+Indices and tables
+==================
+
+* :ref:`genindex`
+* :ref:`modindex`
+* :ref:`search`
\ No newline at end of file

From f16ae30a9ac4b2a84ee8c6f3d5dee2f7a44ba4b5 Mon Sep 17 00:00:00 2001
From: Jiaxiang Yu <yujiaxiang2013@gmail.com>
Date: Mon, 4 Sep 2023 03:20:49 +0800
Subject: [PATCH 02/12] add additional comments for functions related to
 LLMEngine

---
 vllm/core/scheduler.py    | 11 +++++++++++
 vllm/engine/llm_engine.py | 12 +++++++++++-
 2 files changed, 22 insertions(+), 1 deletion(-)

diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py
index 8f381add8f39..7de8a4b0b4ea 100644
--- a/vllm/core/scheduler.py
+++ b/vllm/core/scheduler.py
@@ -88,6 +88,17 @@ def add_seq_group(self, seq_group: SequenceGroup) -> None:
         self.waiting.append(seq_group)
 
     def abort_seq_group(self, request_id: Union[str, Iterable[str]]) -> None:
+        """Aborts a sequence group with the given ID.
+
+        Check if the sequence group with the given ID is present in any of the state queue.
+        If present, remove the sequence group from the state queue. 
+            Also, if any of the sequences in the sequence group is not finished, 
+                free the sequence with status :class:`~vllm.SequenceStatus.FINISHED_ABORTED`.
+        Otherwise, do nothing.
+
+        Args:
+            request_id: The ID of the sequence group to abort.
+        """
         if isinstance(request_id, str):
             request_id = (request_id, )
         request_ids = set(request_id)
diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py
index 74a8905a916d..a50875387bfc 100644
--- a/vllm/engine/llm_engine.py
+++ b/vllm/engine/llm_engine.py
@@ -242,7 +242,7 @@ def add_request(
         prompt_token_ids: Optional[List[int]] = None,
         arrival_time: Optional[float] = None,
     ) -> None:
-        """Add a request to the engine's request pool.
+        r"""Add a request to the engine's request pool.
 
         The request is added to the request pool and will be processed by the
         scheduler as `engine.step()` is called. The exact scheduling policy is
@@ -257,6 +257,13 @@ def add_request(
                 use the tokenizer to convert the prompts to token IDs.
             arrival_time: The arrival time of the request. If None, we use
                 the current monotonic time.
+
+        Details:
+            - Set arrival_time to the current time if it is None.
+            - Set prompt_token_ids to the encoded prompt if it is None.
+            - Create `best_of` number of :class:`~vllm.Sequence` objects.
+            - Create a :class:`~vllm.SequenceGroup` object from the list of :class:`~vllm.Sequence`.
+            - Add the :class:`~vllm.SequenceGroup` object to the scheduler.
         """
         if arrival_time is None:
             arrival_time = time.monotonic()
@@ -281,6 +288,9 @@ def abort_request(self, request_id: Union[str, Iterable[str]]) -> None:
 
         Args:
             request_id: The ID(s) of the request to abort.
+
+        Details:
+            - Refer to the :meth:`~vllm.core.scheduler.Scheduler.abort_seq_group`.
         """
         self.scheduler.abort_seq_group(request_id)
 

From 8d375e9cb1a61a79e65bf9595740e3299a0c6f5c Mon Sep 17 00:00:00 2001
From: Jiaxiang Yu <yujiaxiang2013@gmail.com>
Date: Mon, 4 Sep 2023 04:51:01 +0800
Subject: [PATCH 03/12] add additional comments for functions related to
 AsyncLLMEngine

---
 vllm/engine/async_llm_engine.py | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py
index aa7775124224..3d02ee7d2796 100644
--- a/vllm/engine/async_llm_engine.py
+++ b/vllm/engine/async_llm_engine.py
@@ -415,6 +415,18 @@ async def generate(
         Yields:
             The output `RequestOutput` objects from the LLMEngine for the
             request.
+        
+        Details:
+            - Create an event to notify us that there is new request to the engine.
+            - Add the request into the engine's waiting queue via :meth:`~vllm.engine.llm_engine.LLMEngine.add_request`.
+            - Create a loop that keeps processing 
+               If the engine is not running, kick it off with :meth:`~vllm.engine.async_llm_engine.AsyncLLMEngine.engine_step` 
+               Else, Wait for new output via :meth:`~vllm.engine.async_llm_engine.AsyncLLMEngine.request_event.wait`.
+               If timed out, continue.
+               
+               Upon receiving a new result, decode and yield.
+
+               If the request is finished, release and reset necessary resources. 
         """
         # Preprocess the request.
         # This should not be used for logging, as it is monotonic time.

From 0eeb8aa8cccafcaef3c1088a4aac568e0cb528aa Mon Sep 17 00:00:00 2001
From: Jiaxiang Yu <yujiaxiang2013@gmail.com>
Date: Mon, 4 Sep 2023 12:40:40 +0800
Subject: [PATCH 04/12] add examples for add_request, abort_request and step

---
 vllm/engine/llm_engine.py | 37 +++++++++++++++++++++++++++++++++++++
 1 file changed, 37 insertions(+)

diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py
index a50875387bfc..eba4ec207bb4 100644
--- a/vllm/engine/llm_engine.py
+++ b/vllm/engine/llm_engine.py
@@ -264,6 +264,17 @@ def add_request(
             - Create `best_of` number of :class:`~vllm.Sequence` objects.
             - Create a :class:`~vllm.SequenceGroup` object from the list of :class:`~vllm.Sequence`.
             - Add the :class:`~vllm.SequenceGroup` object to the scheduler.
+
+        Example::
+            >>> # initialize engine
+            >>> engine = LLMEngine.from_engine_args(engine_args)
+            >>> # set request arguments
+            >>> example_prompt = "Who is the president of the United States?"
+            >>> sampling_params = SamplingParams(temperature=0.0)
+            >>> request_id = 0
+            >>> # add the request to the engine
+            >>> engine.add_request(str(request_id), example_prompt, SamplingParams(temperature=0.0))
+            >>> # continue the request processing
         """
         if arrival_time is None:
             arrival_time = time.monotonic()
@@ -291,6 +302,12 @@ def abort_request(self, request_id: Union[str, Iterable[str]]) -> None:
 
         Details:
             - Refer to the :meth:`~vllm.core.scheduler.Scheduler.abort_seq_group`.
+
+        Example::
+            >>> # initialize engine and add a request with request_id
+            >>> request_id = str(0)
+            >>> # abort the request
+            >>> engine.abort_request(request_id)
         """
         self.scheduler.abort_seq_group(request_id)
 
@@ -557,6 +574,26 @@ def step(self) -> List[RequestOutput]:
         token blocks to be swapped in/out/copy. Then, it executes the model
         and updates the scheduler with the model outputs. Finally, it decodes
         the sequences and returns the newly generated results.
+
+        Example::
+            >>> # Please see the example/ folder for more detailed examples.
+            >>> # initialize engine and request arguments
+            >>> engine = LLMEngine.from_engine_args(engine_args)
+            >>> example_inputs = [(0, "Who is the president of the United States?", SamplingParams(temperature=0.0))]
+            >>> # Start the engine with an event loop
+            >>> while True:
+            >>>     if example_inputs:
+            >>>         request_id, example_prompt, sampling_params = example_inputs.pop(0)
+            >>>         engine.add_request(str(request_id), example_prompt, sampling_params)
+            >>>
+            >>>     # continue the request processing
+            >>>     request_outputs = engine.step()
+            >>>     for request_output in request_outputs:
+            >>>         if request_output.finished:
+            >>>             # return or show the request output
+            >>>
+            >>>     if not (engine.has_unfinished_requests() or example_inputs):
+            >>>         break
         """
         seq_group_metadata_list, scheduler_outputs, ignored = self._schedule()
         if scheduler_outputs.is_empty():

From 44b8a64a8efb5f61657ec323769c8d6ee912df30 Mon Sep 17 00:00:00 2001
From: Jiaxiang Yu <yujiaxiang2013@gmail.com>
Date: Mon, 4 Sep 2023 12:41:01 +0800
Subject: [PATCH 05/12] add examples for generate and abort

---
 vllm/engine/async_llm_engine.py | 31 ++++++++++++++++++++++++++++++-
 1 file changed, 30 insertions(+), 1 deletion(-)

diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py
index 3d02ee7d2796..aa3918a071f6 100644
--- a/vllm/engine/async_llm_engine.py
+++ b/vllm/engine/async_llm_engine.py
@@ -426,7 +426,33 @@ async def generate(
                
                Upon receiving a new result, decode and yield.
 
-               If the request is finished, release and reset necessary resources. 
+               If the request is finished, release and reset necessary resources.
+
+        Example::
+            >>> # Please refer to the code in entrypoints/api_server.py for a complete example.
+            >>> # initialize the engine and the example input
+            >>> engine = AsyncLLMEngine.from_engine_args(engine_args)
+            >>> example_input = {
+            >>>     "prompt": "Who is the president of the United States?",
+            >>>     "stream": False, # assume the non-streaming case
+            >>>     "temperature": 0.0,
+            >>>     "request_id": 0,
+            >>> }
+            >>>
+            >>> # start the generation
+            >>> results_generator = engine.generate(example_input["prompt"], SamplingParams(temperature=example_input["temperature"]), example_input["request_id"])
+            >>>
+            >>> # get the results
+            >>> final_output = None
+            >>> async for request_output in results_generator:
+            >>>     if await request.is_disconnected():
+            >>>         await engine.abort(request_id) # Abort the request if the client disconnects.
+            >>>         # Return or raise an error
+            >>>         ...
+            >>>     final_output = request_output
+            >>>
+            >>> # Process and return the final output
+            >>> ...
         """
         # Preprocess the request.
         # This should not be used for logging, as it is monotonic time.
@@ -455,6 +481,9 @@ async def abort(self, request_id: str) -> None:
 
         Args:
             request_id: The unique id of the request.
+
+        Example::
+            Please see the example in :meth:`~vllm.engine.async_llm_engine.AsyncLLMEngine.generate`.
         """
         if not self.is_running:
             raise AsyncEngineDeadError(

From cfbe20490422837812136ba25711b9f83d167048 Mon Sep 17 00:00:00 2001
From: Jiaxiang Yu <yujiaxiang2013@gmail.com>
Date: Mon, 11 Sep 2023 04:55:00 +0800
Subject: [PATCH 06/12] add documentation for _init_cache

---
 .../dev/{ => engine}/async_llm_engine.rst      |  0
 .../{engine.rst => engine/engine_index.rst}    |  0
 docs/source/dev/{ => engine}/llm_engine.rst    |  2 +-
 docs/source/index.rst                          |  2 +-
 vllm/engine/llm_engine.py                      | 18 +++++++++++++++---
 vllm/worker/worker.py                          |  8 ++++++++
 6 files changed, 25 insertions(+), 5 deletions(-)
 rename docs/source/dev/{ => engine}/async_llm_engine.rst (100%)
 rename docs/source/dev/{engine.rst => engine/engine_index.rst} (100%)
 rename docs/source/dev/{ => engine}/llm_engine.rst (65%)

diff --git a/docs/source/dev/async_llm_engine.rst b/docs/source/dev/engine/async_llm_engine.rst
similarity index 100%
rename from docs/source/dev/async_llm_engine.rst
rename to docs/source/dev/engine/async_llm_engine.rst
diff --git a/docs/source/dev/engine.rst b/docs/source/dev/engine/engine_index.rst
similarity index 100%
rename from docs/source/dev/engine.rst
rename to docs/source/dev/engine/engine_index.rst
diff --git a/docs/source/dev/llm_engine.rst b/docs/source/dev/engine/llm_engine.rst
similarity index 65%
rename from docs/source/dev/llm_engine.rst
rename to docs/source/dev/engine/llm_engine.rst
index 1de6d7adc87c..b550a9b5faa6 100644
--- a/docs/source/dev/llm_engine.rst
+++ b/docs/source/dev/engine/llm_engine.rst
@@ -2,5 +2,5 @@ LLMEngine
 =================================
 
 .. autoclass:: vllm.engine.llm_engine.LLMEngine
-    :members: add_request, abort_request, step
+    :members: add_request, abort_request, step, _init_cache
     :show-inheritance:
\ No newline at end of file
diff --git a/docs/source/index.rst b/docs/source/index.rst
index 83543654443f..56d432d064ed 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -77,7 +77,7 @@ Documentation
    :maxdepth: 2
    :caption: Developer doc
 
-   dev/engine
+   dev/engine/engine_index
 
 Indices and tables
 ==================
diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py
index eba4ec207bb4..e837c9637eed 100644
--- a/vllm/engine/llm_engine.py
+++ b/vllm/engine/llm_engine.py
@@ -188,7 +188,19 @@ def _verify_args(self) -> None:
         self.cache_config.verify_with_parallel_config(self.parallel_config)
 
     def _init_cache(self) -> None:
-        """Profiles the memory usage and initializes the KV cache."""
+        """Profiles the memory usage and initializes the KV cache.
+        
+        The engine will first conduct a profiling of the existing memory usage. Then, it calculate 
+        the maximum number of GPU and CPU blocks that can be allocated with the remaining free memory.
+        Note that all available GPU memory will be considered during the calculation. More details can 
+        be found in the :meth:`~vllm.worker.worker.Worker.profile_num_available_blocks` method 
+        from class :class:`~vllm.worker.Worker`.
+
+        As there may be multiple workers, we take the minimum number of blocks across all workers to ensure 
+        this can be applied to all workers.
+
+        Finally, the engine will initialize the KV cache with the calculated number of blocks.
+        """
         # Get the maximum number of blocks that can be allocated on GPU and CPU.
         num_blocks = self._run_workers(
             "profile_num_available_blocks",
@@ -242,7 +254,7 @@ def add_request(
         prompt_token_ids: Optional[List[int]] = None,
         arrival_time: Optional[float] = None,
     ) -> None:
-        r"""Add a request to the engine's request pool.
+        """Add a request to the engine's request pool.
 
         The request is added to the request pool and will be processed by the
         scheduler as `engine.step()` is called. The exact scheduling policy is
@@ -301,7 +313,7 @@ def abort_request(self, request_id: Union[str, Iterable[str]]) -> None:
             request_id: The ID(s) of the request to abort.
 
         Details:
-            - Refer to the :meth:`~vllm.core.scheduler.Scheduler.abort_seq_group`.
+            - Refer to the :meth:`~vllm.core.scheduler.Scheduler.abort_seq_group` from class :class:`~vllm.core.scheduler.Scheduler`.
 
         Example::
             >>> # initialize engine and add a request with request_id
diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py
index 6fbc155d68d6..0484b97f209a 100644
--- a/vllm/worker/worker.py
+++ b/vllm/worker/worker.py
@@ -74,6 +74,14 @@ def profile_num_available_blocks(
         gpu_memory_utilization: float,
         cpu_swap_space: int,
     ) -> Tuple[int, int]:
+        """Profiles the peak memory usage of the model and returns the maximum 
+        number of GPU and CPU cache blocks that can be allocated.
+
+        Args:
+            block_size: The size of the cache block.
+            gpu_memory_utilization: The fraction of the total GPU memory to use.
+            cpu_swap_space: The size of the CPU swap space in bytes.
+        """
         # Profile the memory usage of the model and get the maximum number of
         # cache blocks that can be allocated with the remaining free memory.
         torch.cuda.empty_cache()

From d686c2b05fa98017ffb29acf7b94fed422c193e7 Mon Sep 17 00:00:00 2001
From: Jiaxiang Yu <yujiaxiang2013@gmail.com>
Date: Mon, 11 Sep 2023 13:51:21 +0800
Subject: [PATCH 07/12] add a diagram to explain step()

---
 vllm/engine/llm_engine.py | 31 +++++++++++++++++++++----------
 1 file changed, 21 insertions(+), 10 deletions(-)

diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py
index e837c9637eed..ff12e7c9abc5 100644
--- a/vllm/engine/llm_engine.py
+++ b/vllm/engine/llm_engine.py
@@ -189,14 +189,14 @@ def _verify_args(self) -> None:
 
     def _init_cache(self) -> None:
         """Profiles the memory usage and initializes the KV cache.
-        
-        The engine will first conduct a profiling of the existing memory usage. Then, it calculate 
+
+        The engine will first conduct a profiling of the existing memory usage. Then, it calculate
         the maximum number of GPU and CPU blocks that can be allocated with the remaining free memory.
-        Note that all available GPU memory will be considered during the calculation. More details can 
-        be found in the :meth:`~vllm.worker.worker.Worker.profile_num_available_blocks` method 
+        Note that all available GPU memory will be considered during the calculation. More details can
+        be found in the :meth:`~vllm.worker.worker.Worker.profile_num_available_blocks` method
         from class :class:`~vllm.worker.Worker`.
 
-        As there may be multiple workers, we take the minimum number of blocks across all workers to ensure 
+        As there may be multiple workers, we take the minimum number of blocks across all workers to ensure
         this can be applied to all workers.
 
         Finally, the engine will initialize the KV cache with the calculated number of blocks.
@@ -581,11 +581,22 @@ def _process_model_outputs(
     def step(self) -> List[RequestOutput]:
         """Performs one decoding iteration and returns newly generated results.
 
-        This function performs one decoding iteration of the engine. It first
-        schedules the sequences to be executed in the next iteration and the
-        token blocks to be swapped in/out/copy. Then, it executes the model
-        and updates the scheduler with the model outputs. Finally, it decodes
-        the sequences and returns the newly generated results.
+        .. figure:: https://i.imgur.com/vOy3B90.png
+            :alt: Overview of the step function
+            :align: center
+
+            Overview of how the step function performs one decoding iteration of the engine.
+
+        - Step 1: Schedules the sequences to be executed in the next iteration and the token blocks to be swapped in/out/copy.
+
+            - Depending on the scheduling policy, sequences may be `preempted/reordered`.
+            - Sequence Group (SG) refer to a group of sequences that are generated from the same prompt.
+
+        - Step 2: Calls the workers to execute the model.
+        - Step 3: Updates the scheduler with the model outputs.
+        - Step 4: Decodes the sequences.
+        - Step 5: Stops the sequences which satisfied the stopping requirements and frees their memory.
+        - Finally, it returns the newly generated results.
 
         Example::
             >>> # Please see the example/ folder for more detailed examples.

From e7178a61b78f058b1682a8d9e66df83b695ac6fb Mon Sep 17 00:00:00 2001
From: Jiaxiang Yu <yujiaxiang2013@gmail.com>
Date: Mon, 11 Sep 2023 14:15:15 +0800
Subject: [PATCH 08/12] minor style adjusting

---
 vllm/engine/async_llm_engine.py |  4 ++--
 vllm/engine/llm_engine.py       | 33 ++++++++++++++++++++++-----------
 2 files changed, 24 insertions(+), 13 deletions(-)

diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py
index aa3918a071f6..2a54e3f10708 100644
--- a/vllm/engine/async_llm_engine.py
+++ b/vllm/engine/async_llm_engine.py
@@ -428,7 +428,7 @@ async def generate(
 
                If the request is finished, release and reset necessary resources.
 
-        Example::
+        Example:
             >>> # Please refer to the code in entrypoints/api_server.py for a complete example.
             >>> # initialize the engine and the example input
             >>> engine = AsyncLLMEngine.from_engine_args(engine_args)
@@ -482,7 +482,7 @@ async def abort(self, request_id: str) -> None:
         Args:
             request_id: The unique id of the request.
 
-        Example::
+        Example:
             Please see the example in :meth:`~vllm.engine.async_llm_engine.AsyncLLMEngine.generate`.
         """
         if not self.is_running:
diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py
index ff12e7c9abc5..a98f1195bcb2 100644
--- a/vllm/engine/llm_engine.py
+++ b/vllm/engine/llm_engine.py
@@ -194,12 +194,22 @@ def _init_cache(self) -> None:
         the maximum number of GPU and CPU blocks that can be allocated with the remaining free memory.
         Note that all available GPU memory will be considered during the calculation. More details can
         be found in the :meth:`~vllm.worker.worker.Worker.profile_num_available_blocks` method
+
+        The engine will first conduct a profiling of the existing memory usage. Then, it calculate
+        the maximum possible number of GPU and CPU blocks that can be allocated with the remaining free memory.
+        More details can be found in the :meth:`~vllm.worker.worker.Worker.profile_num_available_blocks` method
         from class :class:`~vllm.worker.Worker`.
 
         As there may be multiple workers, we take the minimum number of blocks across all workers to ensure
         this can be applied to all workers.
 
+        Afterwards, as there may be multiple workers, we take the minimum number of blocks across all workers
+        to ensure this can be applied to all of them.
+
         Finally, the engine will initialize the KV cache with the calculated number of blocks.
+
+        .. tip::
+            You may limit the usage of GPU memory by adjusting the `gpu_memory_utilization` parameters.
         """
         # Get the maximum number of blocks that can be allocated on GPU and CPU.
         num_blocks = self._run_workers(
@@ -277,7 +287,7 @@ def add_request(
             - Create a :class:`~vllm.SequenceGroup` object from the list of :class:`~vllm.Sequence`.
             - Add the :class:`~vllm.SequenceGroup` object to the scheduler.
 
-        Example::
+        Example:
             >>> # initialize engine
             >>> engine = LLMEngine.from_engine_args(engine_args)
             >>> # set request arguments
@@ -315,7 +325,7 @@ def abort_request(self, request_id: Union[str, Iterable[str]]) -> None:
         Details:
             - Refer to the :meth:`~vllm.core.scheduler.Scheduler.abort_seq_group` from class :class:`~vllm.core.scheduler.Scheduler`.
 
-        Example::
+        Example:
             >>> # initialize engine and add a request with request_id
             >>> request_id = str(0)
             >>> # abort the request
@@ -587,18 +597,19 @@ def step(self) -> List[RequestOutput]:
 
             Overview of how the step function performs one decoding iteration of the engine.
 
-        - Step 1: Schedules the sequences to be executed in the next iteration and the token blocks to be swapped in/out/copy.
+        Details:
+            - Step 1: Schedules the sequences to be executed in the next iteration and the token blocks to be swapped in/out/copy.
 
-            - Depending on the scheduling policy, sequences may be `preempted/reordered`.
-            - Sequence Group (SG) refer to a group of sequences that are generated from the same prompt.
+                * Depending on the scheduling policy, sequences may be `preempted/reordered`.
+                * Sequence Group (SG) refer to a group of sequences that are generated from the same prompt.
 
-        - Step 2: Calls the workers to execute the model.
-        - Step 3: Updates the scheduler with the model outputs.
-        - Step 4: Decodes the sequences.
-        - Step 5: Stops the sequences which satisfied the stopping requirements and frees their memory.
-        - Finally, it returns the newly generated results.
+            - Step 2: Calls the workers to execute the model.
+            - Step 3: Updates the scheduler with the model outputs.
+            - Step 4: Decodes the sequences.
+            - Step 5: Stops the sequences which satisfied the stopping requirements and frees their memory.
+            - Finally, it returns the newly generated results.
 
-        Example::
+        Example:
             >>> # Please see the example/ folder for more detailed examples.
             >>> # initialize engine and request arguments
             >>> engine = LLMEngine.from_engine_args(engine_args)

From ee72b5c0056168534a3e5e017bd01431d2a12f41 Mon Sep 17 00:00:00 2001
From: Jiaxiang Yu <yujiaxiang2013@gmail.com>
Date: Mon, 11 Sep 2023 23:58:31 +0800
Subject: [PATCH 09/12] update documentation based on new changes in upstream
 repo

---
 docs/source/conf.py             |  3 +-
 docs/source/index.rst           |  2 +-
 vllm/core/scheduler.py          | 11 ++---
 vllm/engine/async_llm_engine.py | 41 +++++++++---------
 vllm/engine/llm_engine.py       | 73 ++++++++++++++++++++-------------
 5 files changed, 74 insertions(+), 56 deletions(-)

diff --git a/docs/source/conf.py b/docs/source/conf.py
index 1ca0a2cb1118..695f6c380b18 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -12,8 +12,8 @@
 
 import os
 import sys
-sys.path.insert(0, os.path.abspath(os.path.join('..', '..')))
 
+sys.path.insert(0, os.path.abspath(os.path.join('..', '..')))
 
 # -- Project information -----------------------------------------------------
 
@@ -21,7 +21,6 @@
 copyright = '2023, vLLM Team'
 author = 'the vLLM Team'
 
-
 # -- General configuration ---------------------------------------------------
 
 # Add any Sphinx extension module names here, as strings. They can be
diff --git a/docs/source/index.rst b/docs/source/index.rst
index 56d432d064ed..a2826cb1cfeb 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -75,7 +75,7 @@ Documentation
 
 .. toctree::
    :maxdepth: 2
-   :caption: Developer doc
+   :caption: Developer Documentation
 
    dev/engine/engine_index
 
diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py
index 7de8a4b0b4ea..7a0575b52c85 100644
--- a/vllm/core/scheduler.py
+++ b/vllm/core/scheduler.py
@@ -90,14 +90,15 @@ def add_seq_group(self, seq_group: SequenceGroup) -> None:
     def abort_seq_group(self, request_id: Union[str, Iterable[str]]) -> None:
         """Aborts a sequence group with the given ID.
 
-        Check if the sequence group with the given ID is present in any of the state queue.
-        If present, remove the sequence group from the state queue. 
-            Also, if any of the sequences in the sequence group is not finished, 
-                free the sequence with status :class:`~vllm.SequenceStatus.FINISHED_ABORTED`.
+        Check if the sequence group with the given ID 
+            is present in any of the state queue.
+        If present, remove the sequence group from the state queue.
+            Also, if any of the sequences in the sequence group is not finished,
+                free the sequence with status `FINISHED_ABORTED`.
         Otherwise, do nothing.
 
         Args:
-            request_id: The ID of the sequence group to abort.
+            request_id: The ID(s) of the sequence group to abort.
         """
         if isinstance(request_id, str):
             request_id = (request_id, )
diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py
index 2a54e3f10708..2463eedcf137 100644
--- a/vllm/engine/async_llm_engine.py
+++ b/vllm/engine/async_llm_engine.py
@@ -250,7 +250,8 @@ class AsyncLLMEngine:
         log_requests: Whether to log the requests.
         start_engine_loop: If True, the background task to run the engine
             will be automatically started in the generate call.
-        *args, *kwargs: Arguments for LLMEngine.
+        *args: Arguments for LLMEngine.
+        *kwargs: Arguments for LLMEngine.
     """
 
     _engine_class: Type[_AsyncLLMEngine] = _AsyncLLMEngine
@@ -415,38 +416,43 @@ async def generate(
         Yields:
             The output `RequestOutput` objects from the LLMEngine for the
             request.
-        
-        Details:
-            - Create an event to notify us that there is new request to the engine.
-            - Add the request into the engine's waiting queue via :meth:`~vllm.engine.llm_engine.LLMEngine.add_request`.
-            - Create a loop that keeps processing 
-               If the engine is not running, kick it off with :meth:`~vllm.engine.async_llm_engine.AsyncLLMEngine.engine_step` 
-               Else, Wait for new output via :meth:`~vllm.engine.async_llm_engine.AsyncLLMEngine.request_event.wait`.
-               If timed out, continue.
-               
-               Upon receiving a new result, decode and yield.
 
-               If the request is finished, release and reset necessary resources.
+        Details:
+            - If the engine is not running, start the background loop,
+              which iteratively invokes
+              :meth:`~vllm.engine.async_llm_engine.AsyncLLMEngine.engine_step` 
+              to process the waiting requests.
+            - Add the request to the engine's `RequestTracker`.
+              On the next background loop, this request will be sent to 
+              the underlying engine.
+              Also, a corresponding `AsyncStream` will be created.
+            - Wait for the request outputs from `AsyncStream` and yield them.
 
         Example:
-            >>> # Please refer to the code in entrypoints/api_server.py for a complete example.
+            >>> # Please refer to entrypoints/api_server.py for 
+            >>> # the complete example.
+            >>>
             >>> # initialize the engine and the example input
             >>> engine = AsyncLLMEngine.from_engine_args(engine_args)
             >>> example_input = {
-            >>>     "prompt": "Who is the president of the United States?",
+            >>>     "prompt": "What is LLM?",
             >>>     "stream": False, # assume the non-streaming case
             >>>     "temperature": 0.0,
             >>>     "request_id": 0,
             >>> }
             >>>
             >>> # start the generation
-            >>> results_generator = engine.generate(example_input["prompt"], SamplingParams(temperature=example_input["temperature"]), example_input["request_id"])
+            >>> results_generator = engine.generate(
+            >>>    example_input["prompt"], 
+            >>>    SamplingParams(temperature=example_input["temperature"]), 
+            >>>    example_input["request_id"])
             >>>
             >>> # get the results
             >>> final_output = None
             >>> async for request_output in results_generator:
             >>>     if await request.is_disconnected():
-            >>>         await engine.abort(request_id) # Abort the request if the client disconnects.
+            >>>         # Abort the request if the client disconnects.
+            >>>         await engine.abort(request_id) 
             >>>         # Return or raise an error
             >>>         ...
             >>>     final_output = request_output
@@ -481,9 +487,6 @@ async def abort(self, request_id: str) -> None:
 
         Args:
             request_id: The unique id of the request.
-
-        Example:
-            Please see the example in :meth:`~vllm.engine.async_llm_engine.AsyncLLMEngine.generate`.
         """
         if not self.is_running:
             raise AsyncEngineDeadError(
diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py
index a98f1195bcb2..30f70a90b420 100644
--- a/vllm/engine/llm_engine.py
+++ b/vllm/engine/llm_engine.py
@@ -190,26 +190,23 @@ def _verify_args(self) -> None:
     def _init_cache(self) -> None:
         """Profiles the memory usage and initializes the KV cache.
 
-        The engine will first conduct a profiling of the existing memory usage. Then, it calculate
-        the maximum number of GPU and CPU blocks that can be allocated with the remaining free memory.
-        Note that all available GPU memory will be considered during the calculation. More details can
-        be found in the :meth:`~vllm.worker.worker.Worker.profile_num_available_blocks` method
-
-        The engine will first conduct a profiling of the existing memory usage. Then, it calculate
-        the maximum possible number of GPU and CPU blocks that can be allocated with the remaining free memory.
-        More details can be found in the :meth:`~vllm.worker.worker.Worker.profile_num_available_blocks` method
+        The engine will first conduct a profiling of the existing memory usage.
+        Then, it calculates the maximum possible number of GPU and CPU blocks
+        that can be allocated with the remaining free memory.
+        More details can be found in the
+        :meth:`~vllm.worker.worker.Worker.profile_num_available_blocks` method
         from class :class:`~vllm.worker.Worker`.
 
-        As there may be multiple workers, we take the minimum number of blocks across all workers to ensure
-        this can be applied to all workers.
-
-        Afterwards, as there may be multiple workers, we take the minimum number of blocks across all workers
+        Afterwards, as there may be multiple workers,
+        we take the minimum number of blocks across all workers
         to ensure this can be applied to all of them.
 
-        Finally, the engine will initialize the KV cache with the calculated number of blocks.
+        Finally, the engine will initialize the KV cache
+        with the calculated number of blocks.
 
         .. tip::
-            You may limit the usage of GPU memory by adjusting the `gpu_memory_utilization` parameters.
+            You may limit the usage of GPU memory
+            by adjusting the `gpu_memory_utilization` parameter.
         """
         # Get the maximum number of blocks that can be allocated on GPU and CPU.
         num_blocks = self._run_workers(
@@ -284,7 +281,8 @@ def add_request(
             - Set arrival_time to the current time if it is None.
             - Set prompt_token_ids to the encoded prompt if it is None.
             - Create `best_of` number of :class:`~vllm.Sequence` objects.
-            - Create a :class:`~vllm.SequenceGroup` object from the list of :class:`~vllm.Sequence`.
+            - Create a :class:`~vllm.SequenceGroup` object
+              from the list of :class:`~vllm.Sequence`.
             - Add the :class:`~vllm.SequenceGroup` object to the scheduler.
 
         Example:
@@ -294,9 +292,14 @@ def add_request(
             >>> example_prompt = "Who is the president of the United States?"
             >>> sampling_params = SamplingParams(temperature=0.0)
             >>> request_id = 0
+            >>>
             >>> # add the request to the engine
-            >>> engine.add_request(str(request_id), example_prompt, SamplingParams(temperature=0.0))
+            >>> engine.add_request(
+            >>>    str(request_id),
+            >>>    example_prompt,
+            >>>    SamplingParams(temperature=0.0))
             >>> # continue the request processing
+            >>> ...
         """
         if arrival_time is None:
             arrival_time = time.monotonic()
@@ -323,7 +326,9 @@ def abort_request(self, request_id: Union[str, Iterable[str]]) -> None:
             request_id: The ID(s) of the request to abort.
 
         Details:
-            - Refer to the :meth:`~vllm.core.scheduler.Scheduler.abort_seq_group` from class :class:`~vllm.core.scheduler.Scheduler`.
+            - Refer to the
+              :meth:`~vllm.core.scheduler.Scheduler.abort_seq_group`
+              from class :class:`~vllm.core.scheduler.Scheduler`.
 
         Example:
             >>> # initialize engine and add a request with request_id
@@ -591,34 +596,44 @@ def _process_model_outputs(
     def step(self) -> List[RequestOutput]:
         """Performs one decoding iteration and returns newly generated results.
 
-        .. figure:: https://i.imgur.com/vOy3B90.png
+        .. figure:: https://i.imgur.com/sv2HssD.png
             :alt: Overview of the step function
             :align: center
 
-            Overview of how the step function performs one decoding iteration of the engine.
+            Overview of the step function.
 
         Details:
-            - Step 1: Schedules the sequences to be executed in the next iteration and the token blocks to be swapped in/out/copy.
+            - Step 1: Schedules the sequences to be executed in the next
+              iteration and the token blocks to be swapped in/out/copy.
 
-                * Depending on the scheduling policy, sequences may be `preempted/reordered`.
-                * Sequence Group (SG) refer to a group of sequences that are generated from the same prompt.
+                - Depending on the scheduling policy,
+                  sequences may be `preempted/reordered`.
+                - A Sequence Group (SG) refers to a group of sequences
+                  that are generated from the same prompt.
 
             - Step 2: Calls the workers to execute the model.
-            - Step 3: Updates the scheduler with the model outputs.
-            - Step 4: Decodes the sequences.
-            - Step 5: Stops the sequences which satisfied the stopping requirements and frees their memory.
-            - Finally, it returns the newly generated results.
+            - Step 3: Processes the model output. This mainly includes:
+
+                - Decodes the relevant outputs.
+                - Updates the scheduled sequence groups with model outputs
+                  based on its `sampling parameters` (`use_beam_search` or not).
+                - Frees the finished sequence groups.
+
+            - Finally, it creates and returns the newly generated results.
 
         Example:
             >>> # Please see the example/ folder for more detailed examples.
+            >>>
             >>> # initialize engine and request arguments
             >>> engine = LLMEngine.from_engine_args(engine_args)
-            >>> example_inputs = [(0, "Who is the president of the United States?", SamplingParams(temperature=0.0))]
+            >>> example_inputs = [(0, "What is LLM?",
+            >>>    SamplingParams(temperature=0.0))]
+            >>>
             >>> # Start the engine with an event loop
             >>> while True:
             >>>     if example_inputs:
-            >>>         request_id, example_prompt, sampling_params = example_inputs.pop(0)
-            >>>         engine.add_request(str(request_id), example_prompt, sampling_params)
+            >>>         req_id, prompt, sampling_params = example_inputs.pop(0)
+            >>>         engine.add_request(str(req_id), prompt, sampling_params)
             >>>
             >>>     # continue the request processing
             >>>     request_outputs = engine.step()

From 628a98a46f0e8adf7db7fd0928ad699152c16b6e Mon Sep 17 00:00:00 2001
From: Jiaxiang Yu <yujiaxiang2013@gmail.com>
Date: Tue, 12 Sep 2023 00:21:50 +0800
Subject: [PATCH 10/12] fix linting issues

---
 vllm/core/scheduler.py          |  2 +-
 vllm/engine/async_llm_engine.py | 12 ++++++------
 vllm/worker/worker.py           |  2 +-
 3 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py
index 7a0575b52c85..6a10692b6d56 100644
--- a/vllm/core/scheduler.py
+++ b/vllm/core/scheduler.py
@@ -90,7 +90,7 @@ def add_seq_group(self, seq_group: SequenceGroup) -> None:
     def abort_seq_group(self, request_id: Union[str, Iterable[str]]) -> None:
         """Aborts a sequence group with the given ID.
 
-        Check if the sequence group with the given ID 
+        Check if the sequence group with the given ID
             is present in any of the state queue.
         If present, remove the sequence group from the state queue.
             Also, if any of the sequences in the sequence group is not finished,
diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py
index 2463eedcf137..833a41c14c0a 100644
--- a/vllm/engine/async_llm_engine.py
+++ b/vllm/engine/async_llm_engine.py
@@ -420,16 +420,16 @@ async def generate(
         Details:
             - If the engine is not running, start the background loop,
               which iteratively invokes
-              :meth:`~vllm.engine.async_llm_engine.AsyncLLMEngine.engine_step` 
+              :meth:`~vllm.engine.async_llm_engine.AsyncLLMEngine.engine_step`
               to process the waiting requests.
             - Add the request to the engine's `RequestTracker`.
-              On the next background loop, this request will be sent to 
+              On the next background loop, this request will be sent to
               the underlying engine.
               Also, a corresponding `AsyncStream` will be created.
             - Wait for the request outputs from `AsyncStream` and yield them.
 
         Example:
-            >>> # Please refer to entrypoints/api_server.py for 
+            >>> # Please refer to entrypoints/api_server.py for
             >>> # the complete example.
             >>>
             >>> # initialize the engine and the example input
@@ -443,8 +443,8 @@ async def generate(
             >>>
             >>> # start the generation
             >>> results_generator = engine.generate(
-            >>>    example_input["prompt"], 
-            >>>    SamplingParams(temperature=example_input["temperature"]), 
+            >>>    example_input["prompt"],
+            >>>    SamplingParams(temperature=example_input["temperature"]),
             >>>    example_input["request_id"])
             >>>
             >>> # get the results
@@ -452,7 +452,7 @@ async def generate(
             >>> async for request_output in results_generator:
             >>>     if await request.is_disconnected():
             >>>         # Abort the request if the client disconnects.
-            >>>         await engine.abort(request_id) 
+            >>>         await engine.abort(request_id)
             >>>         # Return or raise an error
             >>>         ...
             >>>     final_output = request_output
diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py
index 0484b97f209a..b1562a568bcf 100644
--- a/vllm/worker/worker.py
+++ b/vllm/worker/worker.py
@@ -74,7 +74,7 @@ def profile_num_available_blocks(
         gpu_memory_utilization: float,
         cpu_swap_space: int,
     ) -> Tuple[int, int]:
-        """Profiles the peak memory usage of the model and returns the maximum 
+        """Profiles the peak memory usage of the model and returns the maximum
         number of GPU and CPU cache blocks that can be allocated.
 
         Args:

From 285348ad755c0e2e55346642e0a12d2043bbe3f4 Mon Sep 17 00:00:00 2001
From: LiuXiaoxuanPKU <lilyliupku@gmail.com>
Date: Wed, 10 Jan 2024 11:49:39 -0800
Subject: [PATCH 11/12] add mock support

---
 docs/source/conf.py | 29 +++++++++++++++++++++++++++--
 1 file changed, 27 insertions(+), 2 deletions(-)

diff --git a/docs/source/conf.py b/docs/source/conf.py
index 695f6c380b18..1f073a9e9005 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -14,6 +14,10 @@
 import sys
 
 sys.path.insert(0, os.path.abspath(os.path.join('..', '..')))
+from sphinx.ext import autodoc
+import logging
+
+logger = logging.getLogger(__name__)
 
 # -- Project information -----------------------------------------------------
 
@@ -56,7 +60,6 @@
 html_theme = 'sphinx_book_theme'
 html_logo = 'assets/logos/vllm-logo-text-light.png'
 html_theme_options = {
-    'logo_only': True,
     'path_to_docs': 'docs/source',
     'repository_url': 'https://github.com/vllm-project/vllm',
     'use_repository_button': True,
@@ -65,4 +68,26 @@
 # Add any paths that contain custom static files (such as style sheets) here,
 # relative to this directory. They are copied after the builtin static files,
 # so a file named "default.css" will overwrite the builtin "default.css".
-html_static_path = ['_static']
+# html_static_path = ['_static']
+
+# Mock out external dependencies here.
+autodoc_mock_imports = ["torch", "transformers", "psutil", "vllm.cuda_utils"]
+
+for mock_target in autodoc_mock_imports:
+    if mock_target in sys.modules:
+        logger.info(
+            f"Potentially problematic mock target ({mock_target}) found; "
+            "autodoc_mock_imports cannot mock modules that have already "
+            "been loaded into sys.modules when the sphinx build starts.")
+
+
+class MockedClassDocumenter(autodoc.ClassDocumenter):
+    """Remove note about base class when a class is derived from object."""
+
+    def add_line(self, line: str, source: str, *lineno: int) -> None:
+        if line == "   Bases: :py:class:`object`":
+            return
+        super().add_line(line, source, *lineno)
+
+
+autodoc.ClassDocumenter = MockedClassDocumenter

From 834bc487fc321593bb2a1529765041147705d245 Mon Sep 17 00:00:00 2001
From: simon-mo <simon.mo@hey.com>
Date: Fri, 12 Jan 2024 03:20:00 +0000
Subject: [PATCH 12/12] fix capitalization

---
 docs/source/dev/engine/engine_index.rst | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/source/dev/engine/engine_index.rst b/docs/source/dev/engine/engine_index.rst
index 75067486bd71..ba9ae55ddea4 100644
--- a/docs/source/dev/engine/engine_index.rst
+++ b/docs/source/dev/engine/engine_index.rst
@@ -1,4 +1,4 @@
-VLLM Engine
+vLLM Engine
 =================================
 
 .. automodule:: vllm.engine
@@ -7,7 +7,7 @@ VLLM Engine
 .. toctree::
    :maxdepth: 2
    :caption: Engines
-   
+
    llm_engine
    async_llm_engine