diff --git a/.github/assistant.py b/.github/assistant.py index 68f9eeef8f7..c7d19a9c319 100644 --- a/.github/assistant.py +++ b/.github/assistant.py @@ -79,7 +79,7 @@ def set_min_torch_by_python(fpath: str = "requirements/base.txt") -> None: return with open(fpath) as fp: reqs = parse_requirements(fp.readlines()) - pkg_ver = [p for p in reqs if p.name == "torch"][0] + pkg_ver = next(p for p in reqs if p.name == "torch") pt_ver = min([LooseVersion(v[1]) for v in pkg_ver.specs]) pt_ver = max(LooseVersion(LUT_PYTHON_TORCH[py_ver]), pt_ver) with open(fpath) as fp: diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 43de956fd86..c37fdaefcd7 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -38,22 +38,22 @@ repos: - id: detect-private-key - repo: https://github.com/asottile/pyupgrade - rev: v3.9.0 + rev: v3.14.0 hooks: - id: pyupgrade - args: [--py38-plus] + args: ["--py38-plus"] name: Upgrade code - repo: https://github.com/codespell-project/codespell - rev: v2.2.5 + rev: v2.2.6 hooks: - id: codespell additional_dependencies: [tomli] - #args: ["--write-changes"] + args: ["--write-changes"] exclude: pyproject.toml - repo: https://github.com/crate-ci/typos - rev: v1.16.12 + rev: v1.16.17 hooks: - id: typos # empty to do not write fixes @@ -68,13 +68,13 @@ repos: args: ["--in-place"] - repo: https://github.com/psf/black - rev: 23.7.0 + rev: 23.9.1 hooks: - id: black name: Format code - repo: https://github.com/executablebooks/mdformat - rev: 0.7.16 + rev: 0.7.17 hooks: - id: mdformat additional_dependencies: @@ -130,7 +130,7 @@ repos: - id: text-unicode-replacement-char - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.0.277 + rev: v0.0.292 hooks: - id: ruff args: ["--fix"] diff --git a/pyproject.toml b/pyproject.toml index 8fcbef0e92a..9114383c58a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -22,10 +22,8 @@ addopts = [ #filterwarnings = ["error::FutureWarning"] # ToDo xfail_strict = true junit_duration_report = "call" - [tool.coverage.report] exclude_lines = ["pragma: no cover", "pass"] - [tool.coverage.run] parallel = true concurrency = "thread" @@ -81,6 +79,7 @@ wil = "wil" [tool.ruff] +target-version = "py38" line-length = 120 # Enable Pyflakes `E` and `F` codes by default. select = [ @@ -122,6 +121,8 @@ ignore = [ "S301", # todo: `pickle` and modules that wrap it can be unsafe when used to deserialize untrusted data, possible security issue # todo "S310", # todo: Audit URL open for permitted schemes. Allowing use of `file:` or custom schemes is often unexpected. # todo "B905", # todo: `zip()` without an explicit `strict=` parameter + "PYI024", # todo: Use `typing.NamedTuple` instead of `collections.namedtuple` + "PYI041", # todo: Use `float` instead of `int | float`` ] # Exclude a variety of commonly ignored directories. 
exclude = [ diff --git a/requirements/test.txt b/requirements/test.txt index 088f4caa4f4..dc63308e858 100644 --- a/requirements/test.txt +++ b/requirements/test.txt @@ -1,7 +1,7 @@ # NOTE: the upper bound for the package version is only set for CI stability, and it is dropped while installing this package # in case you want to preserve/enforce restrictions on the latest compatible version, add "strict" as an in-line comment -coverage ==7.3.1 +coverage ==7.3.2 pytest ==7.4.2 pytest-cov ==4.1.0 pytest-doctestplus ==1.0.0 @@ -14,4 +14,4 @@ requests <=2.31.0 fire <=0.5.0 cloudpickle >1.3, <=2.2.1 -scikit-learn >=1.1.1, <1.3.1 +scikit-learn >=1.1.1, <1.4.0 diff --git a/src/torchmetrics/audio/__init__.py b/src/torchmetrics/audio/__init__.py index 1df6c22645e..31c01171c01 100644 --- a/src/torchmetrics/audio/__init__.py +++ b/src/torchmetrics/audio/__init__.py @@ -41,16 +41,16 @@ ] if _PESQ_AVAILABLE: - from torchmetrics.audio.pesq import PerceptualEvaluationSpeechQuality # noqa: F401 + from torchmetrics.audio.pesq import PerceptualEvaluationSpeechQuality - __all__.append("PerceptualEvaluationSpeechQuality") + __all__ += ["PerceptualEvaluationSpeechQuality"] if _PYSTOI_AVAILABLE: - from torchmetrics.audio.stoi import ShortTimeObjectiveIntelligibility # noqa: F401 + from torchmetrics.audio.stoi import ShortTimeObjectiveIntelligibility - __all__.append("ShortTimeObjectiveIntelligibility") + __all__ += ["ShortTimeObjectiveIntelligibility"] if _GAMMATONE_AVAILABLE and _TORCHAUDIO_AVAILABLE and _TORCHAUDIO_GREATER_EQUAL_0_10: - from torchmetrics.audio.srmr import SpeechReverberationModulationEnergyRatio # noqa: F401 + from torchmetrics.audio.srmr import SpeechReverberationModulationEnergyRatio - __all__.append("SpeechReverberationModulationEnergyRatio") + __all__ += ["SpeechReverberationModulationEnergyRatio"] diff --git a/src/torchmetrics/audio/stoi.py b/src/torchmetrics/audio/stoi.py index 32c92f5a515..a60473066dd 100644 --- a/src/torchmetrics/audio/stoi.py +++ b/src/torchmetrics/audio/stoi.py @@ -34,7 +34,7 @@ class ShortTimeObjectiveIntelligibility(Metric): The STOI-measure is intrusive, i.e., a function of the clean and degraded speech signals. STOI may be a good alternative to the speech intelligibility index (SII) or the speech transmission index (STI), when you are interested in the effect of nonlinear processing to noisy speech, e.g., noise reduction, binary masking algorithms, - on speech intelligibility. Description taken from `Cees Taal's website`_ and for further defails see `STOI ref1`_ + on speech intelligibility. Description taken from `Cees Taal's website`_ and for further details see `STOI ref1`_ and `STOI ref2`_. This metric is a wrapper for the `pystoi package`_. As the implementation backend implementation only supports diff --git a/src/torchmetrics/classification/accuracy.py b/src/torchmetrics/classification/accuracy.py index 60188aff5c9..117a89cb667 100644 --- a/src/torchmetrics/classification/accuracy.py +++ b/src/torchmetrics/classification/accuracy.py @@ -49,7 +49,8 @@ class BinaryAccuracy(BinaryStatScores): If ``multidim_average`` is set to ``samplewise``, the metric returns ``(N,)`` vector consisting of a scalar value per sample. - Additional dimension ``...`` will be flattened into the batch dimension. + If ``multidim_average`` is set to ``samplewise`` we expect at least one additional dimension ``...`` to be present, + which the reduction will then be applied over instead of the sample dimension ``N``. 
Args: threshold: Threshold for transforming probability to binary {0,1} predictions @@ -176,6 +177,9 @@ class MulticlassAccuracy(MulticlassStatScores): - If ``average='micro'/'macro'/'weighted'``, the shape will be ``(N,)`` - If ``average=None/'none'``, the shape will be ``(N, C)`` + If ``multidim_average`` is set to ``samplewise`` we expect at least one additional dimension ``...`` to be present, + which the reduction will then be applied over instead of the sample dimension ``N``. + Args: num_classes: Integer specifying the number of classes average: @@ -325,6 +329,9 @@ class MultilabelAccuracy(MultilabelStatScores): - If ``average='micro'/'macro'/'weighted'``, the shape will be ``(N,)`` - If ``average=None/'none'``, the shape will be ``(N, C)`` + If ``multidim_average`` is set to ``samplewise`` we expect at least one additional dimension ``...`` to be present, + which the reduction will then be applied over instead of the sample dimension ``N``. + Args: num_labels: Integer specifying the number of labels threshold: Threshold for transforming probability to binary (0,1) predictions diff --git a/src/torchmetrics/classification/exact_match.py b/src/torchmetrics/classification/exact_match.py index 481441a820c..20ab5344373 100644 --- a/src/torchmetrics/classification/exact_match.py +++ b/src/torchmetrics/classification/exact_match.py @@ -54,7 +54,6 @@ class MulticlassExactMatch(Metric): probabilities/logits into an int tensor. - ``target`` (:class:`~torch.Tensor`): An int tensor of shape ``(N, ...)``. - As output to ``forward`` and ``compute`` the metric returns the following output: - ``mcem`` (:class:`~torch.Tensor`): A tensor whose returned shape depends on the ``multidim_average`` argument: @@ -62,6 +61,9 @@ class MulticlassExactMatch(Metric): - If ``multidim_average`` is set to ``global`` the output will be a scalar tensor - If ``multidim_average`` is set to ``samplewise`` the output will be a tensor of shape ``(N,)`` + If ``multidim_average`` is set to ``samplewise`` we expect at least one additional dimension ``...`` to be present, + which the reduction will then be applied over instead of the sample dimension ``N``. + Args: num_classes: Integer specifying the number of labels multidim_average: @@ -206,7 +208,6 @@ class MultilabelExactMatch(Metric): sigmoid per element. Additionally, we convert to int tensor with thresholding using the value in ``threshold``. - ``target`` (:class:`~torch.Tensor`): An int tensor of shape ``(N, C, ...)``. - As output to ``forward`` and ``compute`` the metric returns the following output: - ``mlem`` (:class:`~torch.Tensor`): A tensor whose returned shape depends on the ``multidim_average`` argument: @@ -214,6 +215,9 @@ class MultilabelExactMatch(Metric): - If ``multidim_average`` is set to ``global`` the output will be a scalar tensor - If ``multidim_average`` is set to ``samplewise`` the output will be a tensor of shape ``(N,)`` + If ``multidim_average`` is set to ``samplewise`` we expect at least one additional dimension ``...`` to be present, + which the reduction will then be applied over instead of the sample dimension ``N``. 
+ Args: num_labels: Integer specifying the number of labels threshold: Threshold for transforming probability to binary (0,1) predictions diff --git a/src/torchmetrics/classification/f_beta.py b/src/torchmetrics/classification/f_beta.py index 16f8a1408b4..0386a8b2eb9 100644 --- a/src/torchmetrics/classification/f_beta.py +++ b/src/torchmetrics/classification/f_beta.py @@ -66,6 +66,9 @@ class BinaryFBetaScore(BinaryStatScores): - If ``multidim_average`` is set to ``samplewise`` the output will be a tensor of shape ``(N,)`` consisting of a scalar value per sample. + If ``multidim_average`` is set to ``samplewise`` we expect at least one additional dimension ``...`` to be present, + which the reduction will then be applied over instead of the sample dimension ``N``. + Args: beta: Weighting between precision and recall in calculation. Setting to 1 corresponds to equal weight threshold: Threshold for transforming probability to binary {0,1} predictions @@ -202,7 +205,6 @@ class MulticlassFBetaScore(MulticlassStatScores): probabilities/logits into an int tensor. - ``target`` (:class:`~torch.Tensor`): An int tensor of shape ``(N, ...)``. - As output to ``forward`` and ``compute`` the metric returns the following output: - ``mcfbs`` (:class:`~torch.Tensor`): A tensor whose returned shape depends on the ``average`` and @@ -218,6 +220,9 @@ class MulticlassFBetaScore(MulticlassStatScores): - If ``average='micro'/'macro'/'weighted'``, the shape will be ``(N,)`` - If ``average=None/'none'``, the shape will be ``(N, C)`` + If ``multidim_average`` is set to ``samplewise`` we expect at least one additional dimension ``...`` to be present, + which the reduction will then be applied over instead of the sample dimension ``N``. + Args: beta: Weighting between precision and recall in calculation. Setting to 1 corresponds to equal weight num_classes: Integer specifying the number of classes @@ -382,7 +387,6 @@ class MultilabelFBetaScore(MultilabelStatScores): per element. Additionally, we convert to int tensor with thresholding using the value in ``threshold``. - ``target`` (:class:`~torch.Tensor`): An int tensor of shape ``(N, C, ...)``. - As output to ``forward`` and ``compute`` the metric returns the following output: - ``mlfbs`` (:class:`~torch.Tensor`): A tensor whose returned shape depends on the ``average`` and @@ -398,6 +402,9 @@ class MultilabelFBetaScore(MultilabelStatScores): - If ``average='micro'/'macro'/'weighted'``, the shape will be ``(N,)`` - If ``average=None/'none'``, the shape will be ``(N, C)`` + If ``multidim_average`` is set to ``samplewise`` we expect at least one additional dimension ``...`` to be present, + which the reduction will then be applied over instead of the sample dimension ``N``. + Args: beta: Weighting between precision and recall in calculation. Setting to 1 corresponds to equal weight num_labels: Integer specifying the number of labels @@ -566,6 +573,9 @@ class BinaryF1Score(BinaryFBetaScore): - If ``multidim_average`` is set to ``samplewise``, the metric returns ``(N,)`` vector consisting of a scalar value per sample. + If ``multidim_average`` is set to ``samplewise`` we expect at least one additional dimension ``...`` to be present, + which the reduction will then be applied over instead of the sample dimension ``N``. 
+ Args: threshold: Threshold for transforming probability to binary {0,1} predictions multidim_average: @@ -706,6 +716,9 @@ class MulticlassF1Score(MulticlassFBetaScore): - If ``average='micro'/'macro'/'weighted'``, the shape will be ``(N,)`` - If ``average=None/'none'``, the shape will be ``(N, C)`` + If ``multidim_average`` is set to ``samplewise`` we expect at least one additional dimension ``...`` to be present, + which the reduction will then be applied over instead of the sample dimension ``N``. + Args: preds: Tensor with predictions target: Tensor with true labels @@ -876,6 +889,9 @@ class MultilabelF1Score(MultilabelFBetaScore): - If ``average='micro'/'macro'/'weighted'``, the shape will be ``(N,)`` - If ``average=None/'none'``, the shape will be ``(N, C)``` + If ``multidim_average`` is set to ``samplewise`` we expect at least one additional dimension ``...`` to be present, + which the reduction will then be applied over instead of the sample dimension ``N``. + Args: num_labels: Integer specifying the number of labels threshold: Threshold for transforming probability to binary (0,1) predictions diff --git a/src/torchmetrics/classification/hamming.py b/src/torchmetrics/classification/hamming.py index dd577d92b76..340a647aa8d 100644 --- a/src/torchmetrics/classification/hamming.py +++ b/src/torchmetrics/classification/hamming.py @@ -58,6 +58,9 @@ class BinaryHammingDistance(BinaryStatScores): - If ``multidim_average`` is set to ``samplewise``, the metric returns ``(N,)`` vector consisting of a scalar value per sample. + If ``multidim_average`` is set to ``samplewise`` we expect at least one additional dimension ``...`` to be present, + which the reduction will then be applied over instead of the sample dimension ``N``. + Args: threshold: Threshold for transforming probability to binary {0,1} predictions multidim_average: @@ -171,7 +174,6 @@ class MulticlassHammingDistance(MulticlassStatScores): probabilities/logits into an int tensor. - ``target`` (:class:`~torch.Tensor`): An int tensor of shape ``(N, ...)``. - As output to ``forward`` and ``compute`` the metric returns the following output: - ``mchd`` (:class:`~torch.Tensor`): A tensor whose returned shape depends on the ``average`` and @@ -187,6 +189,9 @@ class MulticlassHammingDistance(MulticlassStatScores): - If ``average='micro'/'macro'/'weighted'``, the shape will be ``(N,)`` - If ``average=None/'none'``, the shape will be ``(N, C)`` + If ``multidim_average`` is set to ``samplewise`` we expect at least one additional dimension ``...`` to be present, + which the reduction will then be applied over instead of the sample dimension ``N``. + Args: num_classes: Integer specifying the number of classes average: @@ -324,7 +329,6 @@ class MultilabelHammingDistance(MultilabelStatScores): ``threshold``. - ``target`` (:class:`~torch.Tensor`): An int tensor of shape ``(N, C, ...)``. - As output to ``forward`` and ``compute`` the metric returns the following output: - ``mlhd`` (:class:`~torch.Tensor`): A tensor whose returned shape depends on the ``average`` and @@ -340,6 +344,9 @@ class MultilabelHammingDistance(MultilabelStatScores): - If ``average='micro'/'macro'/'weighted'``, the shape will be ``(N,)`` - If ``average=None/'none'``, the shape will be ``(N, C)`` + If ``multidim_average`` is set to ``samplewise`` we expect at least one additional dimension ``...`` to be present, + which the reduction will then be applied over instead of the sample dimension ``N``. 
+ Args: num_labels: Integer specifying the number of labels threshold: Threshold for transforming probability to binary (0,1) predictions diff --git a/src/torchmetrics/classification/precision_recall.py b/src/torchmetrics/classification/precision_recall.py index d3530b4c769..d221584c336 100644 --- a/src/torchmetrics/classification/precision_recall.py +++ b/src/torchmetrics/classification/precision_recall.py @@ -57,6 +57,9 @@ class BinaryPrecision(BinaryStatScores): value. If ``multidim_average`` is set to ``samplewise``, the metric returns ``(N,)`` vector consisting of a scalar value per sample. + If ``multidim_average`` is set to ``samplewise`` we expect at least one additional dimension ``...`` to be present, + which the reduction will then be applied over instead of the sample dimension ``N``. + Args: threshold: Threshold for transforming probability to binary {0,1} predictions multidim_average: @@ -187,6 +190,9 @@ class MulticlassPrecision(MulticlassStatScores): - If ``average='micro'/'macro'/'weighted'``, the shape will be ``(N,)`` - If ``average=None/'none'``, the shape will be ``(N, C)`` + If ``multidim_average`` is set to ``samplewise`` we expect at least one additional dimension ``...`` to be present, + which the reduction will then be applied over instead of the sample dimension ``N``. + Args: num_classes: Integer specifying the number of classes average: @@ -340,6 +346,9 @@ class MultilabelPrecision(MultilabelStatScores): - If ``average='micro'/'macro'/'weighted'``, the shape will be ``(N,)`` - If ``average=None/'none'``, the shape will be ``(N, C)`` + If ``multidim_average`` is set to ``samplewise`` we expect at least one additional dimension ``...`` to be present, + which the reduction will then be applied over instead of the sample dimension ``N``. + Args: num_labels: Integer specifying the number of labels threshold: Threshold for transforming probability to binary (0,1) predictions @@ -479,6 +488,9 @@ class BinaryRecall(BinaryStatScores): value. If ``multidim_average`` is set to ``samplewise``, the metric returns ``(N,)`` vector consisting of a scalar value per sample. + If ``multidim_average`` is set to ``samplewise`` we expect at least one additional dimension ``...`` to be present, + which the reduction will then be applied over instead of the sample dimension ``N``. + Args: threshold: Threshold for transforming probability to binary {0,1} predictions multidim_average: @@ -608,6 +620,9 @@ class MulticlassRecall(MulticlassStatScores): - If ``average='micro'/'macro'/'weighted'``, the shape will be ``(N,)`` - If ``average=None/'none'``, the shape will be ``(N, C)`` + If ``multidim_average`` is set to ``samplewise`` we expect at least one additional dimension ``...`` to be present, + which the reduction will then be applied over instead of the sample dimension ``N``. + Args: num_classes: Integer specifying the number of classes average: @@ -760,6 +775,9 @@ class MultilabelRecall(MultilabelStatScores): - If ``average='micro'/'macro'/'weighted'``, the shape will be ``(N,)`` - If ``average=None/'none'``, the shape will be ``(N, C)`` + If ``multidim_average`` is set to ``samplewise`` we expect at least one additional dimension ``...`` to be present, + which the reduction will then be applied over instead of the sample dimension ``N``. 
+ Args: num_labels: Integer specifying the number of labels threshold: Threshold for transforming probability to binary (0,1) predictions diff --git a/src/torchmetrics/classification/specificity.py b/src/torchmetrics/classification/specificity.py index 31d736881cf..d9124968cfc 100644 --- a/src/torchmetrics/classification/specificity.py +++ b/src/torchmetrics/classification/specificity.py @@ -50,6 +50,9 @@ class BinarySpecificity(BinaryStatScores): If ``multidim_average`` is set to ``samplewise``, the metric returns ``(N,)`` vector consisting of a scalar value per sample. + If ``multidim_average`` is set to ``samplewise`` we expect at least one additional dimension ``...`` to be present, + which the reduction will then be applied over instead of the sample dimension ``N``. + Args: threshold: Threshold for transforming probability to binary {0,1} predictions multidim_average: @@ -174,6 +177,9 @@ class MulticlassSpecificity(MulticlassStatScores): - If ``average='micro'/'macro'/'weighted'``, the shape will be ``(N,)`` - If ``average=None/'none'``, the shape will be ``(N, C)`` + If ``multidim_average`` is set to ``samplewise`` we expect at least one additional dimension ``...`` to be present, + which the reduction will then be applied over instead of the sample dimension ``N``. + Args: num_classes: Integer specifying the number of classes average: @@ -307,7 +313,6 @@ class MultilabelSpecificity(MultilabelStatScores): per element. Additionally, we convert to int tensor with thresholding using the value in ``threshold``. - ``target`` (:class:`~torch.Tensor`): An int tensor of shape ``(N, C, ...)`` - As output to ``forward`` and ``compute`` the metric returns the following output: - ``mls`` (:class:`~torch.Tensor`): The returned shape depends on the ``average`` and ``multidim_average`` @@ -323,6 +328,9 @@ class MultilabelSpecificity(MultilabelStatScores): - If ``average='micro'/'macro'/'weighted'``, the shape will be ``(N,)`` - If ``average=None/'none'``, the shape will be ``(N, C)`` + If ``multidim_average`` is set to ``samplewise`` we expect at least one additional dimension ``...`` to be present, + which the reduction will then be applied over instead of the sample dimension ``N``. + Args: num_labels: Integer specifying the number of labels threshold: Threshold for transforming probability to binary (0,1) predictions diff --git a/src/torchmetrics/classification/stat_scores.py b/src/torchmetrics/classification/stat_scores.py index 7c72725d57a..ce671e202bf 100644 --- a/src/torchmetrics/classification/stat_scores.py +++ b/src/torchmetrics/classification/stat_scores.py @@ -107,8 +107,11 @@ class BinaryStatScores(_AbstractStatScores): to ``[tp, fp, tn, fn, sup]`` (``sup`` stands for support and equals ``tp + fn``). The shape depends on the ``multidim_average`` parameter: - - If ``multidim_average`` is set to ``global``, the shape will be ``(5,)`` - - If ``multidim_average`` is set to ``samplewise``, the shape will be ``(N, 5)`` + - If ``multidim_average`` is set to ``global``, the shape will be ``(5,)`` + - If ``multidim_average`` is set to ``samplewise``, the shape will be ``(N, 5)`` + + If ``multidim_average`` is set to ``samplewise`` we expect at least one additional dimension ``...`` to be present, + which the reduction will then be applied over instead of the sample dimension ``N``. 
Args: threshold: Threshold for transforming probability to binary {0,1} predictions @@ -208,12 +211,18 @@ class MulticlassStatScores(_AbstractStatScores): to ``[tp, fp, tn, fn, sup]`` (``sup`` stands for support and equals ``tp + fn``). The shape depends on ``average`` and ``multidim_average`` parameters: - - If ``multidim_average`` is set to ``global`` - - If ``average='micro'/'macro'/'weighted'``, the shape will be ``(5,)`` - - If ``average=None/'none'``, the shape will be ``(C, 5)`` - - If ``multidim_average`` is set to ``samplewise`` - - If ``average='micro'/'macro'/'weighted'``, the shape will be ``(N, 5)`` - - If ``average=None/'none'``, the shape will be ``(N, C, 5)`` + - If ``multidim_average`` is set to ``global``: + + - If ``average='micro'/'macro'/'weighted'``, the shape will be ``(5,)`` + - If ``average=None/'none'``, the shape will be ``(C, 5)`` + + - If ``multidim_average`` is set to ``samplewise``: + + - If ``average='micro'/'macro'/'weighted'``, the shape will be ``(N, 5)`` + - If ``average=None/'none'``, the shape will be ``(N, C, 5)`` + + If ``multidim_average`` is set to ``samplewise`` we expect at least one additional dimension ``...`` to be present, + which the reduction will then be applied over instead of the sample dimension ``N``. Args: num_classes: Integer specifying the number of classes @@ -352,12 +361,18 @@ class MultilabelStatScores(_AbstractStatScores): to ``[tp, fp, tn, fn, sup]`` (``sup`` stands for support and equals ``tp + fn``). The shape depends on ``average`` and ``multidim_average`` parameters: - - If ``multidim_average`` is set to ``global`` - - If ``average='micro'/'macro'/'weighted'``, the shape will be ``(5,)`` - - If ``average=None/'none'``, the shape will be ``(C, 5)`` - - If ``multidim_average`` is set to ``samplewise`` - - If ``average='micro'/'macro'/'weighted'``, the shape will be ``(N, 5)`` - - If ``average=None/'none'``, the shape will be ``(N, C, 5)`` + - If ``multidim_average`` is set to ``global``: + + - If ``average='micro'/'macro'/'weighted'``, the shape will be ``(5,)`` + - If ``average=None/'none'``, the shape will be ``(C, 5)`` + + - If ``multidim_average`` is set to ``samplewise``: + + - If ``average='micro'/'macro'/'weighted'``, the shape will be ``(N, 5)`` + - If ``average=None/'none'``, the shape will be ``(N, C, 5)`` + + If ``multidim_average`` is set to ``samplewise`` we expect at least one additional dimension ``...`` to be present, + which the reduction will then be applied over instead of the sample dimension ``N``. 
Args: num_labels: Integer specifying the number of labels diff --git a/src/torchmetrics/detection/mean_ap.py b/src/torchmetrics/detection/mean_ap.py index 6cc80a62317..74d9d212a6d 100644 --- a/src/torchmetrics/detection/mean_ap.py +++ b/src/torchmetrics/detection/mean_ap.py @@ -879,7 +879,7 @@ def _get_coco_format( f"Invalid input box of sample {image_id}, element {k} (expected 4 values, got {len(image_box)})" ) - if type(image_label) != int: + if not isinstance(image_label, int): raise ValueError( f"Invalid input class of sample {image_id}, element {k}" f" (expected value of type integer, got type {type(image_label)})" @@ -915,7 +915,7 @@ def _get_coco_format( if scores is not None: score = scores[image_id][k].cpu().tolist() - if type(score) != float: + if not isinstance(score, float): raise ValueError( f"Invalid input score of sample {image_id}, element {k}" f" (expected value of type float, got type {type(score)})" diff --git a/src/torchmetrics/functional/audio/__init__.py b/src/torchmetrics/functional/audio/__init__.py index b6469c7aace..077442b0b83 100644 --- a/src/torchmetrics/functional/audio/__init__.py +++ b/src/torchmetrics/functional/audio/__init__.py @@ -42,16 +42,16 @@ ] if _PESQ_AVAILABLE: - from torchmetrics.functional.audio.pesq import perceptual_evaluation_speech_quality # noqa: F401 + from torchmetrics.functional.audio.pesq import perceptual_evaluation_speech_quality - __all__.append("perceptual_evaluation_speech_quality") + __all__ += ["perceptual_evaluation_speech_quality"] if _PYSTOI_AVAILABLE: - from torchmetrics.functional.audio.stoi import short_time_objective_intelligibility # noqa: F401 + from torchmetrics.functional.audio.stoi import short_time_objective_intelligibility - __all__.append("short_time_objective_intelligibility") + __all__ += ["short_time_objective_intelligibility"] if _GAMMATONE_AVAILABLE and _TORCHAUDIO_AVAILABLE and _TORCHAUDIO_GREATER_EQUAL_0_10: - from torchmetrics.functional.audio.srmr import speech_reverberation_modulation_energy_ratio # noqa: F401 + from torchmetrics.functional.audio.srmr import speech_reverberation_modulation_energy_ratio - __all__.append("speech_reverberation_modulation_energy_ratio") + __all__ += ["speech_reverberation_modulation_energy_ratio"] diff --git a/src/torchmetrics/functional/audio/sdr.py b/src/torchmetrics/functional/audio/sdr.py index 9e47d9489c3..9f4043948b6 100644 --- a/src/torchmetrics/functional/audio/sdr.py +++ b/src/torchmetrics/functional/audio/sdr.py @@ -63,7 +63,7 @@ def _symmetric_toeplitz(vector: Tensor) -> Tensor: def _compute_autocorr_crosscorr(target: Tensor, preds: Tensor, corr_len: int) -> Tuple[Tensor, Tensor]: r"""Compute the auto correlation of `target` and the cross correlation of `target` and `preds`. - This calculation is done using the fast Fourier transform (FFT). Let's denotes the symmetric Toeplitz matric of the + This calculation is done using the fast Fourier transform (FFT). Let's denotes the symmetric Toeplitz matrix of the auto correlation of `target` as `R`, the cross correlation as 'b', then solving the equation `Rh=b` could have `h` as the coordinate of `preds` in the column space of the `corr_len` shifts of `target`.
@@ -81,7 +81,7 @@ def _compute_autocorr_crosscorr(target: Tensor, preds: Tensor, corr_len: int) -> n_fft = 2 ** math.ceil(math.log2(preds.shape[-1] + target.shape[-1] - 1)) # computes the auto correlation of `target` - # r_0 is the first row of the symmetric Toeplitz matric + # r_0 is the first row of the symmetric Toeplitz matrix t_fft = torch.fft.rfft(target, n=n_fft, dim=-1) r_0 = torch.fft.irfft(t_fft.real**2 + t_fft.imag**2, n=n_fft)[..., :corr_len] diff --git a/src/torchmetrics/functional/audio/stoi.py b/src/torchmetrics/functional/audio/stoi.py index 56d077a4982..81d7a6ffda7 100644 --- a/src/torchmetrics/functional/audio/stoi.py +++ b/src/torchmetrics/functional/audio/stoi.py @@ -35,7 +35,7 @@ def short_time_objective_intelligibility( STOI-measure is intrusive, i.e., a function of the clean and degraded speech signals. STOI may be a good alternative to the speech intelligibility index (SII) or the speech transmission index (STI), when you are interested in the effect of nonlinear processing to noisy speech, e.g., noise reduction, binary masking algorithms, on speech - intelligibility. Description taken from `Cees Taal's website`_ and for further defails see `STOI ref1`_ and + intelligibility. Description taken from `Cees Taal's website`_ and for further details see `STOI ref1`_ and `STOI ref2`_. This metric is a wrapper for the `pystoi package`_. As the implementation backend implementation only supports diff --git a/src/torchmetrics/functional/detection/__init__.py b/src/torchmetrics/functional/detection/__init__.py index 85a2d12e39c..8f818c7b2df 100644 --- a/src/torchmetrics/functional/detection/__init__.py +++ b/src/torchmetrics/functional/detection/__init__.py @@ -22,15 +22,13 @@ __all__ = ["modified_panoptic_quality", "panoptic_quality"] if _TORCHVISION_AVAILABLE and _TORCHVISION_GREATER_EQUAL_0_8: - from torchmetrics.functional.detection.giou import generalized_intersection_over_union # noqa: F401 - from torchmetrics.functional.detection.iou import intersection_over_union # noqa: F401 + from torchmetrics.functional.detection.giou import generalized_intersection_over_union + from torchmetrics.functional.detection.iou import intersection_over_union - __all__.append("generalized_intersection_over_union") - __all__.append("intersection_over_union") + __all__ += ["generalized_intersection_over_union", "intersection_over_union"] if _TORCHVISION_AVAILABLE and _TORCHVISION_GREATER_EQUAL_0_13: - from torchmetrics.functional.detection.ciou import complete_intersection_over_union # noqa: F401 - from torchmetrics.functional.detection.diou import distance_intersection_over_union # noqa: F401 + from torchmetrics.functional.detection.ciou import complete_intersection_over_union + from torchmetrics.functional.detection.diou import distance_intersection_over_union - __all__.append("complete_intersection_over_union") - __all__.append("distance_intersection_over_union") + __all__ += ["complete_intersection_over_union", "distance_intersection_over_union"] diff --git a/src/torchmetrics/functional/image/ssim.py b/src/torchmetrics/functional/image/ssim.py index 3f7bc7fcb4b..d0e9d15c6dc 100644 --- a/src/torchmetrics/functional/image/ssim.py +++ b/src/torchmetrics/functional/image/ssim.py @@ -479,7 +479,7 @@ def multiscale_structural_similarity_index_measure( the range is calculated as the difference and input is clamped between the values. k1: Parameter of structural similarity index measure. k2: Parameter of structural similarity index measure.
- betas: Exponent parameters for individual similarities and contrastive sensitivies returned by different image + betas: Exponent parameters for individual similarities and contrastive sensitivities returned by different image resolutions. normalize: When MultiScaleSSIM loss is used for training, it is desirable to use normalizes to improve the training stability. This `normalize` argument is out of scope of the original implementation [1], and it is diff --git a/src/torchmetrics/functional/text/__init__.py b/src/torchmetrics/functional/text/__init__.py index 504a453b79e..9282be6fbae 100644 --- a/src/torchmetrics/functional/text/__init__.py +++ b/src/torchmetrics/functional/text/__init__.py @@ -47,8 +47,7 @@ if _TRANSFORMERS_GREATER_EQUAL_4_4: - from torchmetrics.functional.text.bert import bert_score # noqa: F401 - from torchmetrics.functional.text.infolm import infolm # noqa: F401 + from torchmetrics.functional.text.bert import bert_score + from torchmetrics.functional.text.infolm import infolm - __all__.append("bert_score") - __all__.append("infolm") + __all__ += ["bert_score", "infolm"] diff --git a/src/torchmetrics/functional/text/chrf.py b/src/torchmetrics/functional/text/chrf.py index 4ed62c7fc49..1b6baca5f76 100644 --- a/src/torchmetrics/functional/text/chrf.py +++ b/src/torchmetrics/functional/text/chrf.py @@ -268,7 +268,7 @@ def _calculate_fscore( beta: A parameter determining an importance of recall w.r.t. precision. If `beta=1`, their importance is equal. Return: - A chrF/chrF++ score. This function is universal both for sentence-level and corpus-level calucation. + A chrF/chrF++ score. This function is universal both for sentence-level and corpus-level calculation. """ diff --git a/src/torchmetrics/functional/text/eed.py b/src/torchmetrics/functional/text/eed.py index de5c50fa786..f6f562f5338 100644 --- a/src/torchmetrics/functional/text/eed.py +++ b/src/torchmetrics/functional/text/eed.py @@ -145,7 +145,7 @@ def _eed_function( next_row = [inf] * (len(hyp) + 1) for w in range(1, len(ref) + 1): - for i in range(0, len(hyp) + 1): + for i in range(len(hyp) + 1): if i > 0: next_row[i] = min( next_row[i - 1] + deletion, @@ -252,7 +252,7 @@ def _eed_compute(sentence_level_scores: List[Tensor]) -> Tensor: def _preprocess_sentences( preds: Union[str, Sequence[str]], target: Sequence[Union[str, Sequence[str]]], - language: Union[Literal["en"], Literal["ja"]], + language: Literal["en", "ja"], ) -> Tuple[Union[str, Sequence[str]], Sequence[Union[str, Sequence[str]]]]: """Preprocess strings according to language requirements. diff --git a/src/torchmetrics/functional/text/helper.py b/src/torchmetrics/functional/text/helper.py index 4fe72fcf635..d4c9ff7ae04 100644 --- a/src/torchmetrics/functional/text/helper.py +++ b/src/torchmetrics/functional/text/helper.py @@ -242,7 +242,7 @@ def _add_cache(self, prediction_tokens: List[str], edit_distance: List[List[Tupl node = value[0] # type: ignore def _find_cache(self, prediction_tokens: List[str]) -> Tuple[int, List[List[Tuple[int, _EditOperations]]]]: - """Find the already calculated rows of the Levenshtein edit distance matric. + """Find the already calculated rows of the Levenshtein edit distance matrix. Args: prediction_tokens: A tokenized predicted sentence.
diff --git a/src/torchmetrics/functional/text/rouge.py b/src/torchmetrics/functional/text/rouge.py index ff04f76cd2c..4bd7e27bc10 100644 --- a/src/torchmetrics/functional/text/rouge.py +++ b/src/torchmetrics/functional/text/rouge.py @@ -490,7 +490,7 @@ def rouge_score( if not isinstance(rouge_keys, tuple): rouge_keys = (rouge_keys,) for key in rouge_keys: - if key not in ALLOWED_ROUGE_KEYS.keys(): + if key not in ALLOWED_ROUGE_KEYS: raise ValueError(f"Got unknown rouge key {key}. Expected to be one of {list(ALLOWED_ROUGE_KEYS.keys())}") rouge_keys_values = [ALLOWED_ROUGE_KEYS[key] for key in rouge_keys] diff --git a/src/torchmetrics/functional/text/sacre_bleu.py b/src/torchmetrics/functional/text/sacre_bleu.py index ff34fb174b4..af247be76d5 100644 --- a/src/torchmetrics/functional/text/sacre_bleu.py +++ b/src/torchmetrics/functional/text/sacre_bleu.py @@ -333,7 +333,7 @@ def sacre_bleu_score( if tokenize not in AVAILABLE_TOKENIZERS: raise ValueError(f"Argument `tokenize` expected to be one of {AVAILABLE_TOKENIZERS} but got {tokenize}.") - if tokenize not in _SacreBLEUTokenizer._TOKENIZE_FN.keys(): + if tokenize not in _SacreBLEUTokenizer._TOKENIZE_FN: raise ValueError( f"Unsupported tokenizer selected. Please, choose one of {list(_SacreBLEUTokenizer._TOKENIZE_FN.keys())}" ) diff --git a/src/torchmetrics/functional/text/squad.py b/src/torchmetrics/functional/text/squad.py index 2440333c61f..01dfb4ec0e6 100644 --- a/src/torchmetrics/functional/text/squad.py +++ b/src/torchmetrics/functional/text/squad.py @@ -119,7 +119,7 @@ def _squad_input_check( ) answers: Dict[str, Union[List[str], List[int]]] = target["answers"] # type: ignore[assignment] - if "text" not in answers.keys(): + if "text" not in answers: raise KeyError( "Expected keys in a 'answers' are 'text'." "Please make sure that 'answer' maps to a `SQuAD` format dictionary.\n" diff --git a/src/torchmetrics/image/ssim.py b/src/torchmetrics/image/ssim.py index ac0808ea653..5056589fd14 100644 --- a/src/torchmetrics/image/ssim.py +++ b/src/torchmetrics/image/ssim.py @@ -249,7 +249,7 @@ class MultiScaleStructuralSimilarityIndexMeasure(Metric): The ``data_range`` must be given when ``dim`` is not None. k1: Parameter of structural similarity index measure. k2: Parameter of structural similarity index measure. - betas: Exponent parameters for individual similarities and contrastive sensitivies returned by different image + betas: Exponent parameters for individual similarities and contrastive sensitivities returned by different image resolutions. normalize: When MultiScaleStructuralSimilarityIndexMeasure loss is used for training, it is desirable to use normalizes to improve the training stability. This `normalize` argument is out of scope of the original diff --git a/src/torchmetrics/retrieval/fall_out.py b/src/torchmetrics/retrieval/fall_out.py index 52a00298665..eea6283898d 100644 --- a/src/torchmetrics/retrieval/fall_out.py +++ b/src/torchmetrics/retrieval/fall_out.py @@ -40,7 +40,7 @@ class RetrievalFallOut(RetrievalMetric): As output to ``forward`` and ``compute`` the metric returns the following output: - - ``fo@k`` (:class:`~torch.Tensor`): A tensor with the computed metric + - ``fallout@k`` (:class:`~torch.Tensor`): A tensor with the computed metric All ``indexes``, ``preds`` and ``target`` must have the same dimension and will be flatten at the beginning, so that for example, a tensor of shape ``(N, M)`` is treated as ``(N * M, )``. 
Predictions will be first grouped by diff --git a/src/torchmetrics/text/__init__.py b/src/torchmetrics/text/__init__.py index 52d01026f5a..48807a98fc4 100644 --- a/src/torchmetrics/text/__init__.py +++ b/src/torchmetrics/text/__init__.py @@ -45,8 +45,7 @@ ] if _TRANSFORMERS_GREATER_EQUAL_4_4: - from torchmetrics.text.bert import BERTScore # noqa: F401 - from torchmetrics.text.infolm import InfoLM # noqa: F401 + from torchmetrics.text.bert import BERTScore + from torchmetrics.text.infolm import InfoLM - __all__.append("BERTScore") - __all__.append("InfoLM") + __all__ += ["BERTScore", "InfoLM"] diff --git a/src/torchmetrics/wrappers/tracker.py b/src/torchmetrics/wrappers/tracker.py index 6809fdb115f..7e9913b23a9 100644 --- a/src/torchmetrics/wrappers/tracker.py +++ b/src/torchmetrics/wrappers/tracker.py @@ -152,7 +152,7 @@ def compute_all(self) -> Any: """Compute the metric value for all tracked metrics. Return: - By default will try stacking the results from all increaments into a single tensor if the tracked base + By default will try stacking the results from all increments into a single tensor if the tracked base object is a single metric. If a metric collection is provided a dict of stacked tensors will be returned. If the stacking process fails a list of the computed results will be returned. diff --git a/tests/unittests/bases/test_composition.py b/tests/unittests/bases/test_composition.py index 19863276bed..f33d37f2015 100644 --- a/tests/unittests/bases/test_composition.py +++ b/tests/unittests/bases/test_composition.py @@ -67,7 +67,7 @@ def test_metrics_add(second_operand, expected_result): @pytest.mark.parametrize( ("second_operand", "expected_result"), - [(DummyMetric(3), tensor(2)), (3, tensor(2)), (3, tensor(2)), (tensor(3), tensor(2))], + [(DummyMetric(3), tensor(2)), (3, tensor(2)), (tensor(3), tensor(2))], ) def test_metrics_and(second_operand, expected_result): """Test that `and` operator works and returns a compositional metric.""" diff --git a/tests/unittests/text/test_rouge.py b/tests/unittests/text/test_rouge.py index e2d31418e03..fe1ba4cbfcd 100644 --- a/tests/unittests/text/test_rouge.py +++ b/tests/unittests/text/test_rouge.py @@ -73,7 +73,7 @@ def _compute_rouge_score( aggregator_avg = BootstrapAggregator() if accumulate == "best": - key_curr = list(list_results[0].keys())[0] + key_curr = next(iter(list_results[0].keys())) all_fmeasure = torch.tensor([v[key_curr].fmeasure for v in list_results]) highest_idx = torch.argmax(all_fmeasure).item() aggregator.add_scores(list_results[highest_idx])
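For reference, the ``multidim_average="samplewise"`` behaviour described in the docstring passages above can be sketched with a minimal, illustrative example (random inputs are purely for demonstration; the printed shapes follow the output shapes documented in the patch):

    import torch
    from torchmetrics.classification import BinaryAccuracy, MulticlassStatScores

    # samplewise reduction requires at least one extra dimension beyond the sample
    # dimension N, e.g. inputs of shape (N, L); the reduction is then applied over
    # that extra dimension instead of over N
    preds = torch.rand(4, 10)                # probabilities, shape (N, L)
    target = torch.randint(2, (4, 10))       # binary labels, shape (N, L)

    acc = BinaryAccuracy(multidim_average="samplewise")
    print(acc(preds, target).shape)          # torch.Size([4]) -> one value per sample

    # for stat scores the samplewise output is (N, 5): [tp, fp, tn, fn, sup] per sample
    stats = MulticlassStatScores(num_classes=3, average="micro", multidim_average="samplewise")
    mc_preds = torch.randint(3, (4, 10))     # class indices, shape (N, L)
    mc_target = torch.randint(3, (4, 10))
    print(stats(mc_preds, mc_target).shape)  # torch.Size([4, 5])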