[CHORE] Upgrade Rust toolchain to 2024-08-01 #2639

Merged (6 commits, Aug 12, 2024)
1,096 changes: 566 additions & 530 deletions Cargo.lock

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion daft/expressions/expressions.py
@@ -655,7 +655,7 @@ def approx_percentiles(self, percentiles: builtins.float | builtins.list[builtin
 │ --- ┆ --- │
 │ Float64 ┆ FixedSizeList[Float64; 3] │
 ╞═════════════════════╪════════════════════════════════╡
-│ 2.9742334234767167 ┆ [1.993661701417351, 2.9742334… │
+│ 2.9742334234767163 ┆ [1.993661701417351, 2.9742334… │
 ╰─────────────────────┴────────────────────────────────╯
 <BLANKLINE>
 (Showing first 1 of 1 rows)
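A side note on the doctest change above: the expected percentile moves only in its final digit. A quick standalone Rust sketch (illustrative, not Daft code) shows the two literals are one ULP apart, i.e. adjacent f64 values, which is consistent with a last-bit floating-point difference under the new toolchain rather than an algorithmic change:

    fn main() {
        let old = 2.9742334234767167_f64;
        let new = 2.9742334234767163_f64;
        // Adjacent f64 bit patterns differ by exactly 1 (one ULP apart).
        println!("ulp distance = {}", old.to_bits().abs_diff(new.to_bits()));
    }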
2 changes: 1 addition & 1 deletion rust-toolchain.toml
@@ -1,4 +1,4 @@
 [toolchain]
-channel = "nightly-2024-04-01"
+channel = "nightly-2024-08-01"
 components = ["rustfmt", "clippy"]
 profile = "minimal"
2 changes: 2 additions & 0 deletions src/arrow2/Cargo.toml
@@ -158,6 +158,7 @@ full = [
   "arrow",
   "io_csv",
   "io_csv_async",
+  "io_flight",
   "io_json",
   "io_ipc",
   "io_ipc_write_async",
@@ -179,6 +180,7 @@ io_csv_async = ["io_csv_read_async"]
 io_csv_read = ["csv", "lexical-core"]
 io_csv_read_async = ["csv-async", "lexical-core", "futures"]
 io_csv_write = ["csv-core", "streaming-iterator", "lexical-core"]
+io_flight = ["arrow-format/flight-data"]
 io_ipc = ["arrow-format"]
 io_ipc_compression = ["lz4", "zstd"]
 io_ipc_read_async = ["io_ipc", "futures", "async-stream"]
3 changes: 1 addition & 2 deletions src/arrow2/src/array/dictionary/typed_iterator.rs
@@ -37,13 +37,12 @@ impl<O: Offset> DictValue for Utf8Array<O> {
             .ok_or(Error::InvalidArgumentError(
                 "could not convert array to dictionary value".into(),
             ))
-            .map(|arr| {
+            .inspect(|arr| {
                 assert_eq!(
                     arr.null_count(),
                     0,
                     "null values in values not supported in iteration"
                 );
-                arr
             })
     }
 }
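This `.map` to `.inspect` rewrite recurs throughout the PR: a closure that only performs a side effect and returns its argument unchanged is what `Result::inspect` and `Option::inspect` (stable since Rust 1.76) express directly, and newer clippy nightlies flag the manual form (presumably via the `manual_inspect` lint; the PR doesn't name it). A standalone sketch of the pattern:

    fn main() {
        let res: Result<Vec<i32>, String> = Ok(vec![1, 2, 3]);

        // Before: map runs a side effect and returns the value unchanged.
        let before = res.clone().map(|v| {
            assert!(!v.is_empty());
            v
        });

        // After: inspect borrows the Ok value, runs the side effect,
        // and passes the Result through untouched.
        let after = res.inspect(|v| assert!(!v.is_empty()));

        assert_eq!(before, after);
    }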
1 change: 1 addition & 0 deletions src/arrow2/src/array/mod.rs
@@ -10,6 +10,7 @@
 //! * [`BinaryArray`] and [`MutableBinaryArray`], an array of opaque variable length values
 //! * [`ListArray`] and [`MutableListArray`], an array of arrays (e.g. `[[1, 2], None, [], [None]]`)
 //! * [`StructArray`] and [`MutableStructArray`], an array of arrays identified by a string (e.g. `{"a": [1, 2], "b": [true, false]}`)
+//!
 //! All immutable arrays implement the trait object [`Array`] and that can be downcasted
 //! to a concrete struct based on [`PhysicalType`](crate::datatypes::PhysicalType) available from [`Array::data_type`].
 //! All immutable arrays are backed by [`Buffer`](crate::buffer::Buffer) and thus cloning and slicing them is `O(1)`.
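Several hunks like this one add a blank `//!`/`///` line between a doc-comment list and the paragraph after it. Without the separator, Markdown treats the paragraph as a lazy continuation of the last bullet; the newer toolchain's clippy warns about this (the `doc_lazy_continuation` lint name is my assumption, the PR doesn't cite it). A minimal sketch:

    /// Supported layouts:
    /// * flat
    /// * nested
    ///
    /// Without the blank doc line above, this paragraph would render as part
    /// of the "nested" bullet instead of starting a new paragraph.
    pub struct Layouts;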
1 change: 1 addition & 0 deletions src/arrow2/src/bitmap/utils/zip_validity.rs
@@ -91,6 +91,7 @@ where
 /// This enum can be used in two distinct ways:
 /// * as an iterator, via `Iterator::next`
 /// * as an enum of two iterators, via `match self`
+///
 /// The latter allows specializing to when there are no nulls
 #[derive(Debug, Clone)]
 pub enum ZipValidity<T, I, V>
2 changes: 1 addition & 1 deletion src/arrow2/src/buffer/immutable.rs
@@ -1,4 +1,4 @@
-use std::{ops::Deref, sync::Arc, usize};
+use std::{ops::Deref, sync::Arc};
 
 use either::Either;
 
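The dropped `usize` here refers to the legacy `std::usize` module, not the primitive type: since Rust 1.43, constants like `usize::MAX` are associated with the primitive itself, so the module import is redundant and newer toolchains lint it as unused. A sketch:

    fn main() {
        // No `use std::usize;` needed: MAX is an associated constant
        // on the primitive type since Rust 1.43.
        println!("{}", usize::MAX);
    }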
1 change: 1 addition & 0 deletions src/arrow2/src/compute/cast/mod.rs
@@ -487,6 +487,7 @@ fn cast_list_to_fixed_size_list<O: Offset>(
 /// * Time32 and Time64: precision lost when going to higher interval
 /// * Timestamp and Date{32|64}: precision lost when going to higher interval
 /// * Temporal to/from backing primitive: zero-copy with data type change
+///
 /// Unsupported Casts
 /// * To or from `StructArray`
 /// * List to primitive
1 change: 1 addition & 0 deletions src/arrow2/src/compute/comparison/mod.rs
@@ -7,6 +7,7 @@
 //! The functions are organized in two variants:
 //! * statically typed
 //! * dynamically typed
+//!
 //! The statically typed are available under each module of this module (e.g. [`primitive::eq`], [`primitive::lt_scalar`])
 //! The dynamically typed are available in this module (e.g. [`eq`] or [`lt_scalar`]).
 //!
11 changes: 1 addition & 10 deletions src/arrow2/src/compute/sort/row/interner.rs
@@ -50,16 +50,7 @@ trait HashSingle: BuildHasher {
     where
         Self: Sized,
     {
-        // Rewrite as `hasher.hash_one(&x)` after
-        // https://github.com/rust-lang/rust/issues/86161 is merged.
-        #[cfg(feature = "nightly_build")]
-        {
-            self.hash_one(x)
-        }
-        #[cfg(not(feature = "nightly_build"))]
-        {
-            self.hash_one(&x)
-        }
+        self.hash_one(&x)
     }
 }
 
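`BuildHasher::hash_one`, tracked in the issue referenced by the removed comment, stabilized in Rust 1.71, so the `nightly_build` split collapses to the single stable call. A standalone sketch of the stable API:

    use std::collections::hash_map::RandomState;
    use std::hash::BuildHasher;

    fn main() {
        let state = RandomState::new();
        // hash_one builds a fresh Hasher, hashes one value, and returns the u64.
        let hash: u64 = state.hash_one("some key");
        println!("{hash}");
    }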
1 change: 1 addition & 0 deletions src/arrow2/src/datatypes/mod.rs
@@ -66,6 +66,7 @@ pub enum DataType {
 /// * As used in the Olson time zone database (the "tz database" or
 /// "tzdata"), such as "America/New_York"
 /// * An absolute time zone offset of the form +XX:XX or -XX:XX, such as +07:30
+///
 /// When the timezone is not specified, the timestamp is considered to have no timezone
 /// and is represented _as is_
 Timestamp(TimeUnit, Option<String>),
2 changes: 1 addition & 1 deletion src/arrow2/src/io/csv/write/mod.rs
@@ -103,6 +103,6 @@ where
             .join(std::str::from_utf8(&[options.delimiter]).unwrap())
             .as_bytes(),
     )?;
-    writer.write_all(&[b'\n'])?;
+    writer.write_all(b"\n")?;
     Ok(())
 }
6 changes: 3 additions & 3 deletions src/arrow2/src/io/json/write/mod.rs
@@ -141,15 +141,15 @@ where
     W: std::io::Write,
     I: FallibleStreamingIterator<Item = [u8], Error = Error>,
 {
-    writer.write_all(&[b'['])?;
+    writer.write_all(b"[")?;
     let mut is_first_row = true;
     while let Some(block) = blocks.next()? {
         if !is_first_row {
-            writer.write_all(&[b','])?;
+            writer.write_all(b",")?;
         }
         is_first_row = false;
         writer.write_all(block)?;
     }
-    writer.write_all(&[b']'])?;
+    writer.write_all(b"]")?;
     Ok(())
 }
1 change: 1 addition & 0 deletions src/arrow2/src/io/parquet/read/indexes/mod.rs
@@ -335,6 +335,7 @@ pub fn compute_page_row_intervals(
 /// For each field, the outermost vector corresponds to each parquet column:
 /// a primitive field contains 1 column, a struct field with 2 primitive fields contain 2 columns.
 /// The inner `Vec<Interval>` contains one [`Interval`] per page: its length equals the length of [`ColumnPageStatistics`].
+///
 /// It returns a single [`Vec<Interval>`] denoting the set of intervals that the predicate selects (over all columns).
 ///
 /// This returns one item per `field`. For each field, there is one item per column (for non-nested types it returns one column)
3 changes: 1 addition & 2 deletions src/arrow2/src/io/parquet/write/utils.rs
@@ -131,9 +131,8 @@ impl<T, I: Iterator<Item = T>> Iterator for ExactSizedIter<T, I> {
 
     #[inline]
     fn next(&mut self) -> Option<Self::Item> {
-        self.iter.next().map(|x| {
+        self.iter.next().inspect(|_| {
             self.remaining -= 1;
-            x
         })
     }
 
1 change: 0 additions & 1 deletion src/arrow2/src/lib.rs
@@ -12,7 +12,6 @@
 #![allow(clippy::type_complexity)]
 #![cfg_attr(docsrs, feature(doc_cfg))]
 #![cfg_attr(feature = "simd", feature(portable_simd))]
-#![cfg_attr(feature = "nightly_build", feature(build_hasher_simple_hash_one))]
 
 #[macro_use]
 pub mod array;
3 changes: 1 addition & 2 deletions src/arrow2/src/scalar/fixed_size_list.rs
@@ -31,10 +31,9 @@ impl FixedSizeListScalar {
     pub fn new(data_type: DataType, values: Option<Box<dyn Array>>) -> Self {
         let (field, size) = FixedSizeListArray::get_child_and_size(&data_type);
         let inner_data_type = field.data_type();
-        let values = values.map(|x| {
+        let values = values.inspect(|x| {
             assert_eq!(inner_data_type, x.data_type());
             assert_eq!(size, x.len());
-            x
         });
         Self { values, data_type }
     }
1 change: 1 addition & 0 deletions src/arrow2/src/temporal_conversions.rs
@@ -444,6 +444,7 @@ fn chrono_tz_utf_to_timestamp_ns<O: Offset>(
 /// * parsed values with timezone other than `timezone` are converted to `timezone`.
 /// * parsed values without timezone are null. Use [`utf8_to_naive_timestamp_ns`] to parse naive timezones.
 /// * Null elements remain null; non-parsable elements are null.
+///
 /// The feature `"chrono-tz"` enables IANA and zoneinfo formats for `timezone`.
 /// # Error
 /// This function errors iff `timezone` is not parsable to an offset.
8 changes: 4 additions & 4 deletions src/common/treenode/src/lib.rs
@@ -49,14 +49,14 @@ macro_rules! handle_transform_recursion {
 /// There are three categories of TreeNode APIs:
 ///
 /// 1. "Inspecting" APIs to traverse a tree of `&TreeNodes`:
-///   [`apply`], [`visit`], [`exists`].
+///    [`apply`], [`visit`], [`exists`].
 ///
 /// 2. "Transforming" APIs that traverse and consume a tree of `TreeNode`s
-///   producing possibly changed `TreeNode`s: [`transform`], [`transform_up`],
-///   [`transform_down`], [`transform_down_up`], and [`rewrite`].
+///    producing possibly changed `TreeNode`s: [`transform`], [`transform_up`],
+///    [`transform_down`], [`transform_down_up`], and [`rewrite`].
 ///
 /// 3. Internal APIs used to implement the `TreeNode` API: [`apply_children`],
-///   and [`map_children`].
+///    and [`map_children`].
 ///
 /// | Traversal Order | Inspecting | Transforming |
 /// | --- | --- | --- |
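Here the fix is indentation rather than a blank line (the flattened extraction hides it, so the three-versus-four-space reconstruction above is my inference): continuation lines of a numbered doc list must be indented to the item's content, or rustdoc treats them as a new paragraph, the same lazy-continuation lint family as the other doc fixes. Sketch:

    /// 1. "Inspecting" APIs to traverse a tree:
    ///    a four-space continuation stays inside item 1;
    ///   a three-space one triggers the lint on newer toolchains.
    pub struct TreeNodeDocs;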
1 change: 1 addition & 0 deletions src/daft-core/src/datatypes/dtype.rs
@@ -55,6 +55,7 @@ pub enum DataType {
 /// * As used in the Olson time zone database (the "tz database" or
 /// "tzdata"), such as "America/New_York"
 /// * An absolute time zone offset of the form +XX:XX or -XX:XX, such as +07:30
+///
 /// When the timezone is not specified, the timestamp is considered to have no timezone
 /// and is represented _as is_
 Timestamp(TimeUnit, Option<String>),
1 change: 1 addition & 0 deletions src/daft-dsl/src/lit.rs
@@ -54,6 +54,7 @@ pub enum LiteralValue {
 /// * As used in the Olson time zone database (the "tz database" or
 /// "tzdata"), such as "America/New_York"
 /// * An absolute time zone offset of the form +XX:XX or -XX:XX, such as +07:30
+///
 /// When the timezone is not specified, the timestamp is considered to have no timezone
 /// and is represented _as is_
 Timestamp(i64, TimeUnit, Option<String>),
3 changes: 3 additions & 0 deletions src/parquet2/src/read/compression.rs
@@ -141,6 +141,7 @@ fn decompress_reuse<P: PageIterator>(
 /// ### un-compressed pages:
 /// > page iter: `a` is swapped with `b`
 /// > decompress iter: `b` is swapped with `d`, `b` is swapped with `a`
+///
 /// therefore:
 /// * `PageReader` has its buffer back
 /// * `Decompressor`'s buffer is un-used
@@ -152,10 +153,12 @@
 /// > * `b` is swapped with `a`
 /// > * `c` is moved to `d`
 /// > * (next iteration): `d` is moved to `c`
+///
 /// therefore, while the page is available:
 /// * `PageReader` has its buffer back
 /// * `Decompressor`'s buffer empty
 /// * `DecompressedPage` has the decompressed buffer
+///
 /// after the page is used:
 /// * `PageReader` has its buffer back
 /// * `Decompressor` has its buffer back
53 changes: 39 additions & 14 deletions tests/table/numeric/test_numeric.py
@@ -27,6 +27,19 @@
 ]
 
 
+def lists_close_with_nones(a, b):
+    if len(a) != len(b):
+        return False
+    for x, y in zip(a, b):
+        if x is None and y is None:
+            continue
+        if x is not None and y is not None:
+            np.testing.assert_allclose([x], [y])
+        else:
+            return False
+    return True
+
+
 @pytest.mark.parametrize("data_dtype, op", itertools.product(daft_numeric_types, OPS))
 def test_table_numeric_expressions(data_dtype, op) -> None:
     a, b = [5, 6, 7, 8], [1, 2, 3, 4]
@@ -392,12 +405,16 @@ def test_table_log10_bad_input() -> None:
 def test_table_numeric_log(base: float) -> None:
     table = MicroPartition.from_pydict({"a": [0.1, 0.01, 1.5, None], "b": [1, 10, None, None]})
     log_table = table.eval_expression_list([col("a").log(base), col("b").log(base)])
-    assert [
-        math.log(v, base) if v is not None else v for v in table.get_column("a").to_pylist()
-    ] == log_table.get_column("a").to_pylist()
-    assert [
-        math.log(v, base) if v is not None else v for v in table.get_column("b").to_pylist()
-    ] == log_table.get_column("b").to_pylist()
+
+    assert lists_close_with_nones(
+        log_table.get_column("a").to_pylist(),
+        [math.log(v, base) if v is not None else None for v in table.get_column("a").to_pylist()],
+    )
+
+    assert lists_close_with_nones(
+        log_table.get_column("b").to_pylist(),
+        [math.log(v, base) if v is not None else None for v in table.get_column("b").to_pylist()],
+    )
 
 
 def test_table_log_bad_input() -> None:
@@ -410,12 +427,14 @@ def test_table_log_bad_input() -> None:
 def test_table_numeric_ln() -> None:
     table = MicroPartition.from_pydict({"a": [0.1, 0.01, 1.5, None], "b": [1, 10, None, None]})
     ln_table = table.eval_expression_list([col("a").ln(), col("b").ln()])
-    assert [math.log(v) if v is not None else v for v in table.get_column("a").to_pylist()] == ln_table.get_column(
-        "a"
-    ).to_pylist()
-    assert [math.log(v) if v is not None else v for v in table.get_column("b").to_pylist()] == ln_table.get_column(
-        "b"
-    ).to_pylist()
+    assert lists_close_with_nones(
+        [math.log(v) if v is not None else v for v in table.get_column("a").to_pylist()],
+        ln_table.get_column("a").to_pylist(),
+    )
+    assert lists_close_with_nones(
+        [math.log(v) if v is not None else v for v in table.get_column("b").to_pylist()],
+        ln_table.get_column("b").to_pylist(),
+    )
 
 
 def test_table_ln_bad_input() -> None:
@@ -428,8 +447,14 @@ def test_table_ln_bad_input() -> None:
 def test_table_exp() -> None:
     table = MicroPartition.from_pydict({"a": [0.1, 0.01, None], "b": [1, 10, None]})
     exp_table = table.eval_expression_list([col("a").exp(), col("b").exp()])
-    assert [1.1051709180756477, 1.010050167084168, None] == exp_table.get_column("a").to_pylist()
-    assert [2.718281828459045, 22026.465794806718, None] == exp_table.get_column("b").to_pylist()
+    assert lists_close_with_nones(
+        [1.1051709180756477, 1.010050167084168, None],
+        exp_table.get_column("a").to_pylist(),
+    )
+    assert lists_close_with_nones(
+        [2.718281828459045, 22026.465794806718, None],
+        exp_table.get_column("b").to_pylist(),
+    )
 
 
 def test_table_numeric_sqrt() -> None:
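The thread through all four test hunks: exact `==` comparisons on floats are replaced by the new `lists_close_with_nones` helper, which tolerates last-digit differences (via `np.testing.assert_allclose`) while still requiring the `None`s to line up, matching the one-ULP doctest change earlier in this PR.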
2 changes: 1 addition & 1 deletion tools/check_for_rustls.sh
@@ -1,2 +1,2 @@
 #!/bin/bash
-cargo tree --workspace --all-features | grep -vzq rustls
+cargo tree --workspace --all-features | grep -v 'rustls-pemfile' | grep -vzq 'rustls'
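For context on the script: `grep -z` reads the whole input as one record and `-vq` inverts the match quietly, so the command exits non-zero (failing CI) whenever `rustls` appears anywhere in the dependency tree. The new `grep -v 'rustls-pemfile'` stage drops that crate's lines first, so it alone no longer trips the check.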