Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions vllm/model_executor/layers/linear.py
Original file line number Diff line number Diff line change
Expand Up @@ -261,6 +261,7 @@ class ReplicatedLinear(LinearBase):
quant_config: Quantization configuration.
prefix: The name of the layer in the state dict, including all parents
(e.g. model.layers.0.qkv_proj)
return_bias: If true, return bias together with outputs in forward pass.
"""

def __init__(
Expand Down Expand Up @@ -523,6 +524,7 @@ class MergedColumnParallelLinear(ColumnParallelLinear):
quant_config: Quantization configuration.
prefix: The name of the layer in the state dict, including all parents
(e.g. model.layers.0.qkv_proj)
return_bias: If true, return bias together with outputs in forward pass.
"""

def __init__(
Expand Down Expand Up @@ -805,6 +807,7 @@ class QKVParallelLinear(ColumnParallelLinear):
quant_config: Quantization configuration.
prefix: The name of the layer in the state dict, including all parents
(e.g. model.layers.0.qkv_proj)
return_bias: If true, return bias together with outputs in forward pass.
"""

def __init__(
Expand Down Expand Up @@ -1155,7 +1158,13 @@ class RowParallelLinear(LinearBase):
bias can be fused with other element-wise operations.
We skip adding bias but instead return it.
params_dtype: Data type for the parameters.
reduce_results: If true, call all-reduce on the output and make Y available
to all GPUs; otherwise, every GPU will have its own output,
which is Y = X_iA_i
quant_config: Quantization configuration.
prefix: The name of the layer in the state dict, including all parents
(e.g. model.layers.0.down_proj)
return_bias: If true, return bias together with outputs in forward pass.
"""

def __init__(
Expand Down