Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
85 changes: 78 additions & 7 deletions crates/ruff_db/src/parsed.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,13 @@ use std::sync::Arc;

use arc_swap::ArcSwapOption;
use get_size2::GetSize;
use ruff_python_ast::{AnyRootNodeRef, ModModule, NodeIndex};
use ruff_python_parser::{ParseOptions, Parsed, parse_unchecked};
use ruff_python_ast::{
AnyRootNodeRef, HasNodeIndex, ModExpression, ModModule, NodeIndex, NodeIndexError,
StringLiteral,
};
use ruff_python_parser::{
ParseError, ParseErrorType, ParseOptions, Parsed, parse_string_annotation, parse_unchecked,
};

use crate::Db;
use crate::files::File;
Expand Down Expand Up @@ -45,6 +50,36 @@ pub fn parsed_module_impl(db: &dyn Db, file: File) -> Parsed<ModModule> {
.expect("PySourceType always parses into a module")
}

pub fn parsed_string_annotation(
source: &str,
string: &StringLiteral,
) -> Result<Parsed<ModExpression>, ParseError> {
let expr = parse_string_annotation(source, string)?;

// We need the sub-ast of the string annotation to be indexed
indexed::ensure_indexed(&expr, string.node_index().load()).map_err(|err| {
let message = match err {
NodeIndexError::NoParent => {
"Internal Error: string annotation's parent had no NodeIndex".to_owned()
}
NodeIndexError::OutOfIndices => {
"File too long, ran out of encoding space for string annotations".to_owned()
}
NodeIndexError::OutOfSubIndices => {
"Substring annotation is too complex, ran out of encoding space".to_owned()
}
NodeIndexError::TooNested => "Too many levels of nested string annotations".to_owned(),
};

ParseError {
error: ParseErrorType::OtherError(message),
location: string.range,
}
})?;

Ok(expr)
}

/// A wrapper around a parsed module.
///
/// This type manages instances of the module AST. A particular instance of the AST
Expand Down Expand Up @@ -169,13 +204,39 @@ mod indexed {
pub parsed: Parsed<ModModule>,
}

/// Assigns node indices to every node of the given sub-AST.
///
/// The range of indices handed out is derived from `parent_node_index` via
/// `sub_indices`, so that nodes of the sub-AST get indices that cannot be
/// confused with those of other nodes.
///
/// # Errors
///
/// * [`NodeIndexError::NoParent`] if the parent index has no `u32` value.
/// * Any error returned by `sub_indices` for the parent index.
/// * [`NodeIndexError::OutOfSubIndices`] if the sub-AST has more nodes than
///   the range reserved for it.
pub fn ensure_indexed(
    parsed: &Parsed<ModExpression>,
    parent_node_index: NodeIndex,
) -> Result<(), NodeIndexError> {
    let Some(parent_index) = parent_node_index.as_u32() else {
        return Err(NodeIndexError::NoParent);
    };
    let (first_index, last_index) = sub_indices(parent_index)?;

    // Only the side effect of writing indices matters here, so the visitor
    // is told not to collect the visited nodes.
    let mut indexer = Visitor {
        index: first_index,
        max_index: last_index,
        nodes: None,
        overflowed: false,
    };
    AnyNodeRef::from(parsed.syntax()).visit_source_order(&mut indexer);

    if indexer.overflowed {
        Err(NodeIndexError::OutOfSubIndices)
    } else {
        Ok(())
    }
}

impl IndexedModule {
/// Create a new [`IndexedModule`] from the given AST.
#[allow(clippy::unnecessary_cast)]
pub fn new(parsed: Parsed<ModModule>) -> Arc<Self> {
let mut visitor = Visitor {
nodes: Vec::new(),
nodes: Some(Vec::new()),
index: 0,
max_index: MAX_REAL_INDEX,
overflowed: false,
};

let mut inner = Arc::new(IndexedModule {
Expand All @@ -185,7 +246,7 @@ mod indexed {

AnyNodeRef::from(inner.parsed.syntax()).visit_source_order(&mut visitor);

let index: Box<[AnyRootNodeRef<'_>]> = visitor.nodes.into_boxed_slice();
let index: Box<[AnyRootNodeRef<'_>]> = visitor.nodes.unwrap().into_boxed_slice();

// SAFETY: We cast from `Box<[AnyRootNodeRef<'_>]>` to `Box<[AnyRootNodeRef<'static>]>`,
// faking the 'static lifetime to create the self-referential struct. The node references
Expand Down Expand Up @@ -214,7 +275,9 @@ mod indexed {
/// A visitor that collects nodes in source order.
pub struct Visitor<'a> {
pub index: u32,
pub nodes: Vec<AnyRootNodeRef<'a>>,
pub max_index: u32,
pub nodes: Option<Vec<AnyRootNodeRef<'a>>>,
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This seems to always be set to Some?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Whoops, good catch! (It should be None for the new path, since we don't care about that result at all)

pub overflowed: bool,
}

impl<'a> Visitor<'a> {
Expand All @@ -223,8 +286,16 @@ mod indexed {
T: HasNodeIndex + std::fmt::Debug,
AnyRootNodeRef<'a>: From<&'a T>,
{
node.node_index().set(NodeIndex::from(self.index));
self.nodes.push(AnyRootNodeRef::from(node));
// Only check on write (the maximum is orders of magnitude less than u32::MAX)
if self.index > self.max_index {
self.overflowed = true;
} else {
node.node_index().set(NodeIndex::from(self.index));
}

if let Some(nodes) = &mut self.nodes {
nodes.push(AnyRootNodeRef::from(node));
}
self.index += 1;
}
}
Expand Down
92 changes: 92 additions & 0 deletions crates/ruff_python_ast/src/node_index.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,45 @@ where
}

/// A unique index for a node within an AST.
///
/// Our encoding of 32-bit AST node indices is as follows:
///
/// * `u32::MAX` (1111...1) is reserved as a forbidden value (mapped to 0 for `NonZero`)
/// * `u32::MAX - 1` (1111...0) is reserved for `NodeIndex::NONE`
/// * The top two bits encode the sub-AST level:
///   * 00 is top-level AST
///   * 01 is sub-AST (string annotation)
///   * 10 is sub-sub-AST (string annotation in string annotation)
///   * 11 is forbidden (well, it only appears in the above reserved values)
/// * The remaining 30 bits are the real (sub)-AST node index
///
/// To get the first sub-index of a node's sub-AST we:
///
/// * increment the sub-AST level in the high bits
/// * at level 1, multiply the real index by 256
/// * at level 2, multiply the real index by 8
///
/// The multiplication gives each node a reserved space of 256 nodes for its sub-AST
/// to work with ("should be enough for anybody"), and 8 nodes for a sub-sub-AST
/// (enough for an identifier and maybe some simple unions).
///
/// Here are some implications:
///
/// * We have 2^30 top-level AST nodes (1 billion)
/// * To have a string annotation, the parent node needs to be multiplied by 256 without
///   overflowing 30 bits, so string annotations cannot be used after 2^22 nodes (4 million),
///   which is roughly a million lines of code.
/// * To have a sub-string annotation, the top-level node needs to be multiplied
///   by 256 * 8, so sub-string annotations cannot be used after 2^19 nodes (500 thousand),
///   or about 100k lines of code.
///
/// This feels like a pretty reasonable compromise that will work well in practice,
/// although it creates some very wonky boundary conditions that will be very unpleasant
/// if someone runs into them.
///
/// That said, string annotations are in many regards "legacy", so new code ideally
/// doesn't have to use them, and there's never a real reason to use a sub-annotation,
/// let alone a sub-sub-annotation.
#[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Copy, Hash)]
#[cfg_attr(feature = "get-size", derive(get_size2::GetSize))]
pub struct NodeIndex(NonZeroU32);
Expand All @@ -39,6 +78,59 @@ impl NodeIndex {
}
}

/// Reasons why a sub-AST could not be given node indices.
pub enum NodeIndexError {
    /// The parent index is already at the deepest encodable sub-AST level.
    TooNested,
    /// The sub-AST has more nodes than the index range reserved for it.
    OutOfSubIndices,
    /// The parent node has no index to derive sub-indices from.
    NoParent,
    /// The parent index is too large for its sub-AST range to fit in the encoding.
    OutOfIndices,
}

/// Deepest sub-AST level that can be encoded (level 0 is the top-level AST).
const MAX_LEVEL: u32 = 2;
/// Number of high bits needed to store any level in `0..=MAX_LEVEL`.
const LEVEL_BITS: u32 = 32 - MAX_LEVEL.leading_zeros();
/// Bit position of the lowest level bit.
const LEVEL_SHIFT: u32 = 32 - LEVEL_BITS;
/// Mask selecting the level bits: `LEVEL_BITS` ones, shifted to the top of the word.
const LEVEL_MASK: u32 = ((1 << LEVEL_BITS) - 1) << LEVEL_SHIFT;
/// Number of indices reserved for the sub-AST of a top-level node.
const SUB_NODES: u32 = 256;
/// Number of indices reserved for the sub-AST of a node that is itself in a sub-AST.
const SUB_SUB_NODES: u32 = 8;
/// Largest index value that fits below the level bits.
pub const MAX_REAL_INDEX: u32 = (1 << LEVEL_SHIFT) - 1;

/// sub-AST level is stored in the top two bits
fn sub_ast_level(index: u32) -> u32 {
    (index & LEVEL_MASK) >> LEVEL_SHIFT
}

/// Get the first and last index of the sub-AST of the input
///
/// The returned range is inclusive, lives one level below `index`, and spans
/// `SUB_NODES` indices for a top-level parent or `SUB_SUB_NODES` indices for a
/// parent that is itself inside a sub-AST.
///
/// # Errors
///
/// * [`NodeIndexError::TooNested`] if `index` is already at (or beyond) `MAX_LEVEL`.
/// * [`NodeIndexError::OutOfIndices`] if the parent's real index is too large
///   for its reserved range to fit below the level bits.
pub fn sub_indices(index: u32) -> Result<(u32, u32), NodeIndexError> {
    let level = sub_ast_level(index);
    if level >= MAX_LEVEL {
        return Err(NodeIndexError::TooNested);
    }
    let next_level = (level + 1) << LEVEL_SHIFT;
    let without_level = index & !LEVEL_MASK;
    let nodes_in_level = match level {
        0 => SUB_NODES,
        1 => SUB_SUB_NODES,
        _ => unreachable!(
            "Someone made a mistake updating the encoding of node indices: {index:08X} had level {level}"
        ),
    };

    // If this overflows the file has hundreds of thousands of lines of code,
    // but that *can* happen (we just can't support string annotations that deep).
    //
    // The multiplier must be the size of the range reserved at this level:
    // scaling by the (larger) top-level size for nested annotations would
    // waste encoding space and reject parents that still have room.
    let sub_index_without_level = without_level
        .checked_mul(nodes_in_level)
        .ok_or(NodeIndexError::OutOfIndices)?;
    if sub_index_without_level > MAX_REAL_INDEX {
        return Err(NodeIndexError::OutOfIndices);
    }

    let first_index = sub_index_without_level | next_level;
    // Can't overflow by construction: the scaled index is a multiple of
    // `nodes_in_level` that is at most `MAX_REAL_INDEX`, so the whole range
    // stays below the level bits.
    let last_index = first_index + nodes_in_level - 1;
    Ok((first_index, last_index))
}

impl From<u32> for NodeIndex {
fn from(value: u32) -> Self {
match NonZeroU32::new(value + 1).map(NodeIndex) {
Expand Down
12 changes: 6 additions & 6 deletions crates/ty/docs/rules.md

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading