Skip to content

Implement arrow::ExtensionType#7

Closed
paleolimbot wants to merge 1 commit into master from
arrow-ext-type
Closed

Implement arrow::ExtensionType#7
paleolimbot wants to merge 1 commit into master from
arrow-ext-type

Conversation

@paleolimbot
Copy link
Collaborator

This is only barely reproducible, since both of these are still PRs, but it was done to test the motivating example behind the Arrow extension type PR.

# remotes::install_github("apache/arrow#12467")
# remotes::install_github("paleolimbot/geoarrow@arrow-ext-type")
library(arrow, warn.conflicts = FALSE)
library(dplyr, warn.conflicts = FALSE)
library(geoarrow)

places_folder <- system.file("example_dataset/osm_places", package = "geoarrow")
places <- open_dataset(places_folder)
places$schema$geometry$type
#> GeoArrowType
#> point GEOGCS["WGS 84",DATUM["WGS_...
places$schema$geometry$type$crs
#> [1] "GEOGCS[\"WGS 84\",DATUM[\"WGS_1984\",SPHEROID[\"WGS 84\",6378137,298.257223563],AUTHORITY[\"EPSG\",\"6326\"]],PRIMEM[\"Greenwich\",0,AUTHORITY[\"EPSG\",\"8901\"]],UNIT[\"degree\",0.0174532925199433,AUTHORITY[\"EPSG\",\"9122\"]],AXIS[\"Longitude\",EAST],AXIS[\"Latitude\",NORTH]]"

# works!
Scanner$create(places)$ToTable()
#> Table
#> 7255 rows x 6 columns
#> $osm_id <string>
#> $code <int32>
#> $population <double>
#> $name <string>
#> $geometry <point GEOGCS["WGS 84",DATUM["WGS_...>
#> $fclass <string>
#> 
#> See $metadata for additional Schema metadata

# works!
as.data.frame(Scanner$create(places)$ToTable())
#> # A tibble: 7,255 × 6
#>    osm_id      code population name           geometry                    fclass
#>    <chr>      <int>      <dbl> <chr>          <wk_wkb>                    <chr> 
#>  1 21040334    1001      50781 Roskilde       <POINT (12.08192 55.64335)> city  
#>  2 21040360    1001      72398 Esbjerg        <POINT (8.452075 55.46649)> city  
#>  3 26559154    1001      62687 Randers        <POINT (10.03715 56.46175)> city  
#>  4 26559170    1001      60508 Kolding        <POINT (9.47905 55.4895)>   city  
#>  5 26559198    1001      56567 Vejle          <POINT (9.533324 55.70001)> city  
#>  6 26559213    1001     273077 Aarhus         <POINT (10.2134 56.14963)>  city  
#>  7 26559274    1001     178210 Odense         <POINT (10.38521 55.39972)> city  
#>  8 1368129781  1001      58646 Horsens        <POINT (9.844477 55.86117)> city  
#>  9 2247730880  1001     114194 Aalborg        <POINT (9.921526 57.04626)> city  
#> 10 393558713   1030          0 Englebjerggård <POINT (11.77737 55.2004)>  farm  
#> # … with 7,245 more rows

# unfortunately, this fails...
places %>% 
  filter(population > 100000) %>% 
  select(name, population, fclass, geometry) %>% 
  arrange(desc(population)) %>% 
  collect()
#> Error in `handle_csv_read_error()` at r/R/dplyr-collect.R:33:6:
#> ! NotImplemented: concatenation of extension<geoarrow.point>
#> /Users/deweydunnington/Desktop/rscratch/arrow/cpp/src/arrow/array/concatenate.cc:195  VisitTypeInline(*out_->type, this)
#> /Users/deweydunnington/Desktop/rscratch/arrow/cpp/src/arrow/array/concatenate.cc:590  ConcatenateImpl(data, pool).Concatenate(&out_data)
#> /Users/deweydunnington/Desktop/rscratch/arrow/cpp/src/arrow/compute/kernels/vector_selection.cc:2025  Concatenate(values.chunks(), ctx->memory_pool())
#> /Users/deweydunnington/Desktop/rscratch/arrow/cpp/src/arrow/compute/kernels/vector_selection.cc:2084  TakeCA(*table.column(j), indices, options, ctx)
#> /Users/deweydunnington/Desktop/rscratch/arrow/cpp/src/arrow/compute/exec/sink_node.cc:375  impl_->DoFinish()
#> /Users/deweydunnington/Desktop/rscratch/arrow/cpp/src/arrow/compute/exec/exec_plan.cc:484  iterator_.Next()
#> /Users/deweydunnington/Desktop/rscratch/arrow/cpp/src/arrow/record_batch.cc:337  ReadNext(&batch)
#> /Users/deweydunnington/Desktop/rscratch/arrow/cpp/src/arrow/record_batch.cc:351  ToRecordBatches()

# ...unless we unregister the extension type and use geoarrow_collect()
arrow::unregister_extension_type("geoarrow.point")
open_dataset(places_folder) %>% 
  filter(population > 100000) %>% 
  select(name, population, fclass, geometry) %>% 
  arrange(desc(population)) %>% 
  geoarrow_collect()
#> # A tibble: 5 × 4
#>   name          population fclass           geometry                   
#>   <chr>              <dbl> <chr>            <wk_wkb>                   
#> 1 København         613288 national_capital <POINT (12.57007 55.68672)>
#> 2 Aarhus            273077 city             <POINT (10.2134 56.14963)> 
#> 3 Odense            178210 city             <POINT (10.38521 55.39972)>
#> 4 Aalborg           114194 city             <POINT (9.921526 57.04626)>
#> 5 Frederiksberg     102029 suburb           <POINT (12.53262 55.67802)>

Created on 2022-03-29 by the reprex package (v2.0.1)

@paleolimbot
Copy link
Collaborator Author

(Was superseded by #11)

@paleolimbot paleolimbot deleted the arrow-ext-type branch May 27, 2024 02:07
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Labels

None yet

Projects

None yet

Development

Successfully merging this pull request may close these issues.

1 participant