Skip to content

Commit 62bb614

Browse files
authored
Consider nullability in arrow_type (#63)
1 parent fa7a30e commit 62bb614

File tree

2 files changed

+94
-9
lines changed

2 files changed

+94
-9
lines changed

src/schema.jl

Lines changed: 20 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -259,18 +259,29 @@ end
259259
260260
Get the Arrow/Julia type that should be used for data corresponding to this field.
261261
262-
This is a convenience method that calls `iceberg_type_to_arrow_type()` with the field's type.
262+
This function returns the Arrow/Julia type corresponding to the field's Iceberg type, accounting for nullability:
263+
- If the field is required, returns the type as-is
264+
- If the field is not required (nullable), returns `Union{Missing, T}` where `T` is the base type
263265
264266
# Example
265267
266268
```julia
267-
field = Field(Int32(1), "event_date", "date"; required=true)
268-
arrow_t = arrow_type(field)
269+
field_required = Field(Int32(1), "event_date", "date"; required=true)
270+
arrow_t = arrow_type(field_required)
269271
# Returns Date type - users should provide Date objects
272+
273+
field_nullable = Field(Int32(2), "description", "string"; required=false)
274+
arrow_t = arrow_type(field_nullable)
275+
# Returns Union{Missing, String} - users can provide String or missing values
270276
```
271277
"""
272278
function arrow_type(field::Field)
273-
iceberg_type_to_arrow_type(field.type)
279+
base_type = iceberg_type_to_arrow_type(field.type)
280+
if field.required
281+
return base_type
282+
else
283+
return Union{Missing, base_type}
284+
end
274285
end
275286

276287
"""
@@ -310,22 +321,22 @@ end
310321
Get a dictionary mapping field names to their Arrow/Julia types for the schema.
311322
312323
This helps users understand what data types they need to provide when writing to
313-
an Iceberg table with this schema.
324+
an Iceberg table with this schema. For nullable (non-required) fields, the type will be `Union{Missing, T}`, where `T` is the field's base type.
314325
315326
# Example
316327
317328
```julia
318329
schema = Schema([
319330
Field(Int32(1), "id", "long"; required=true),
320-
Field(Int32(2), "event_date", "date"),
321-
Field(Int32(3), "event_time", "timestamp"),
331+
Field(Int32(2), "event_date", "date"; required=false),
332+
Field(Int32(3), "event_time", "timestamp"; required=true),
322333
])
323334
324335
types = arrow_types(schema)
325336
# Returns Dict{String, Type}:
326337
# "id" => Int64
327-
# "event_date" => Date
328-
# "event_time" => Int64
338+
# "event_date" => Union{Missing, Date}
339+
# "event_time" => Arrow.Timestamp{Arrow.Flatbuf.TimeUnit.MICROSECOND, nothing}
329340
```
330341
"""
331342
function arrow_types(schema::Schema)::Dict{String, Type}

test/schema_tests.jl

Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,8 @@ using RustyIceberg: SortDirection, ASC, DESC
44
using RustyIceberg: NullOrder, NULLS_FIRST, NULLS_LAST
55
using Test
66
using JSON
7+
using Arrow
8+
using Dates
79

810
@testset "Schema Types" begin
911
@testset "Field Creation" begin
@@ -256,4 +258,76 @@ end
256258
end
257259
end
258260

261+
@testset "Arrow Type Mappings" begin
262+
@testset "iceberg_type_to_arrow_type" begin
263+
# Basic types
264+
@test iceberg_type_to_arrow_type("boolean") == Bool
265+
@test iceberg_type_to_arrow_type("int") == Int32
266+
@test iceberg_type_to_arrow_type("long") == Int64
267+
@test iceberg_type_to_arrow_type("float") == Float32
268+
@test iceberg_type_to_arrow_type("double") == Float64
269+
@test iceberg_type_to_arrow_type("string") == String
270+
271+
# Temporal types
272+
@test iceberg_type_to_arrow_type("date") == Dates.Date
273+
@test iceberg_type_to_arrow_type("time") == Int64
274+
@test iceberg_type_to_arrow_type("timestamp") == Arrow.Timestamp{Arrow.Flatbuf.TimeUnit.MICROSECOND, nothing}
275+
@test iceberg_type_to_arrow_type("timestamptz") == Arrow.Timestamp{Arrow.Flatbuf.TimeUnit.MICROSECOND, :UTC}
276+
@test iceberg_type_to_arrow_type("timestamp_ns") == Arrow.Timestamp{Arrow.Flatbuf.TimeUnit.NANOSECOND, nothing}
277+
@test iceberg_type_to_arrow_type("timestamptz_ns") == Arrow.Timestamp{Arrow.Flatbuf.TimeUnit.NANOSECOND, :UTC}
278+
279+
# Complex types
280+
@test iceberg_type_to_arrow_type("uuid") == NTuple{16, UInt8}
281+
@test iceberg_type_to_arrow_type("binary") == Vector{UInt8}
282+
283+
# Decimal types with different precisions
284+
@test iceberg_type_to_arrow_type("decimal(5,2)") == Int32
285+
@test iceberg_type_to_arrow_type("decimal(9,0)") == Int32
286+
@test iceberg_type_to_arrow_type("decimal(15,4)") == Int64
287+
@test iceberg_type_to_arrow_type("decimal(18,6)") == Int64
288+
@test iceberg_type_to_arrow_type("decimal(25,10)") == NTuple{16, UInt8}
289+
@test iceberg_type_to_arrow_type("decimal(38,18)") == NTuple{16, UInt8}
290+
end
291+
292+
@testset "arrow_type with nullable fields" begin
293+
# Required field should return base type
294+
field_required = Field(Int32(1), "id", "long"; required=true)
295+
@test arrow_type(field_required) == Int64
296+
297+
# Nullable field should return Union{Missing, T}
298+
field_nullable = Field(Int32(2), "name", "string"; required=false)
299+
@test arrow_type(field_nullable) == Union{Missing, String}
300+
301+
# Test with date field
302+
field_date_required = Field(Int32(3), "birth_date", "date"; required=true)
303+
@test arrow_type(field_date_required) == Dates.Date
304+
305+
field_date_nullable = Field(Int32(4), "death_date", "date"; required=false)
306+
@test arrow_type(field_date_nullable) == Union{Missing, Dates.Date}
307+
308+
# Test with timestamp field
309+
field_ts_nullable = Field(Int32(5), "created_at", "timestamp"; required=false)
310+
@test arrow_type(field_ts_nullable) == Union{Missing, Arrow.Timestamp{Arrow.Flatbuf.TimeUnit.MICROSECOND, nothing}}
311+
end
312+
313+
@testset "arrow_types for schema" begin
314+
schema = Schema([
315+
Field(Int32(1), "id", "long"; required=true),
316+
Field(Int32(2), "name", "string"; required=false),
317+
Field(Int32(3), "event_date", "date"; required=false),
318+
Field(Int32(4), "created_at", "timestamp"; required=true),
319+
])
320+
321+
types = arrow_types(schema)
322+
323+
# Required fields should have base types
324+
@test types["id"] == Int64
325+
@test types["created_at"] == Arrow.Timestamp{Arrow.Flatbuf.TimeUnit.MICROSECOND, nothing}
326+
327+
# Nullable fields should have Union{Missing, T}
328+
@test types["name"] == Union{Missing, String}
329+
@test types["event_date"] == Union{Missing, Dates.Date}
330+
end
331+
end
332+
259333
println("All schema tests passed!")

0 commit comments

Comments
 (0)