-
Couldn't load subscription status.
- Fork 374
Closed
Description
Using Julia 1.9.1 and DataFrames 1.5.0, a `BoundsError` is thrown when sorting a data frame by one particular column (`:sku`) after a `select` whose result is assigned back to the same variable. The error does not occur when sorting by another column (`:units`) of the same data frame. Despite scrutinizing the contents of the column that triggers the error, I found no clues to its source.
[version details in expandable section below code]
Example data to reproduce error is here.
using DataFrames, CSV
# Directory that holds the example data file "examp.csv".
const datap = "."
# Case 1 - FAILS: select [:sku, :units], then sort the data frame by :sku.
# This is the call that throws the BoundsError reported in this issue.
let
df = CSV.read(joinpath(datap, "examp.csv"), DataFrame)
df = select(df, [:sku, :units])
sort(df, :sku)
end
# Case 2 - succeeds: same select as case 1, but sorting by :units instead of :sku.
let
df = CSV.read(joinpath(datap, "examp.csv"), DataFrame)
df = select(df, [:sku, :units])
sort(df, :units)
end
# Case 3 - succeeds: same select as case 1, but sorting the :sku column vector
# directly (sort(df.sku)) rather than sorting the data frame by that column.
let
df = CSV.read(joinpath(datap, "examp.csv"), DataFrame)
df = select(df, [:sku, :units])
sort(df.sku)
end
# Case 4 - succeeds: sorting by :sku on the data frame as read, without the select step.
let
df = CSV.read(joinpath(datap, "examp.csv"), DataFrame)
sort(df, :sku)
end
# Case 5 - separate failure: reading with skipto/limit throws
# CSV.read error: InexactError: trunc(Int64, NaN)
let
df = CSV.read(joinpath(datap, "examp.csv"), DataFrame; skipto = 8000, limit = 5000)
end
## investigate :sku column ##
"""
    skucategory(sku)

Classify a SKU value by the kind of characters it contains.

Return `missing` when `sku` is `missing`. Otherwise return the label of the
first rule that matches, checked in this order:

- "digits": the whole value consists of decimal digits.
- "S_sku": the whole value consists of decimal digits and the capital letter S.
- "alphanumeric": the whole value consists of alphanumeric characters.
- "slash": the value contains a forward slash.
- "other": none of the above (this includes the empty string).
"""
function skucategory(sku)
    ismissing(sku) && return missing

    # Order matters: each pattern is a superset of the one before it, so the
    # most specific label has to be tested first.
    occursin(r"^\d+$", sku) && return "digits"
    occursin(r"^[\dS]+$", sku) && return "S_sku"
    occursin(r"^[[:alnum:]]+$", sku) && return "alphanumeric"
    occursin(r"\/", sku) && return "slash"

    return "other"
end
# Label every row with its SKU category, then count the rows per category.
let
    skus = CSV.read(joinpath(datap, "examp.csv"), DataFrame)
    transform!(skus, :sku => ByRow(skucategory) => :skucat)
    combine(groupby(skus, :skucat), nrow)
end
version info, project.toml, error output
julia> versioninfo()
Julia Version 1.9.1
Commit 147bdf428c (2023-06-07 08:27 UTC)
Platform Info:
OS: macOS (arm64-apple-darwin21.6.0)
CPU: 10 × Apple M1 Max
WORD_SIZE: 64
LIBM: libopenlibm
LLVM: libLLVM-14.0.6 (ORCJIT, apple-m1)
Threads: 5 on 8 virtual cores
Environment:
JULIA_NUM_THREADS = 4
JULIA_EDITOR = Emacs
JULIA_PKG_DEVDIR = /Users/george/Documents/julia/dev
Status `~/Desktop/exampl/Project.toml`
[336ed68f] CSV v0.10.11
[a93c6f00] DataFrames v1.5.0 `https://github.com/JuliaData/DataFrames.jl.git#main`
# Bounds error
ERROR: BoundsError: attempt to access 217465-element Vector{Int64} at index [0]
Stacktrace:
[1] setindex!
@ ./array.jl:969 [inlined]
[2] _sort!(v::Vector{Int64}, a::Base.Sort.MissingOptimization{Base.Sort.BoolOptimization{Base.Sort.Small{10, Base.Sort.InsertionSortAlg, Base.Sort.IEEEFloatOptimization{Base.Sort.IsUIntMappable{Base.Sort.Small{40, Base.Sort.InsertionSortAlg, Base.Sort.CheckSorted{Base.Sort.ComputeExtrema{Base.Sort.ConsiderCountingSort{Base.Sort.CountingSort, Base.Sort.ConsiderRadixSort{Base.Sort.RadixSort, Base.Sort.Small{80, Base.Sort.InsertionSortAlg, Base.Sort.ScratchQuickSort{Missing, Missing, Base.Sort.InsertionSortAlg}}}}}}}, Base.Sort.StableCheckSorted{Base.Sort.ScratchQuickSort{Missing, Missing, Base.Sort.InsertionSortAlg}}}}}}}, o::Base.Order.Perm{Base.Order.ForwardOrdering, Vector{Union{Missing, String31}}}, kw::NamedTuple{(:lo, :hi, :allow_legacy_dispatch), Tuple{Int64, Int64, Bool}})
@ Base.Sort ./sort.jl:583
[3] sort!
@ ./sort.jl:2123 [inlined]
[4] sort!(v::Vector{Int64}, lo::Int64, hi::Int64, #unused#::SortingAlgorithms.TimSortAlg, o::Base.Order.Perm{Base.Order.ForwardOrdering, Vector{Union{Missing, String31}}})
@ SortingAlgorithms ~/.julia/packages/SortingAlgorithms/n1AWW/src/SortingAlgorithms.jl:488
[5] sort!
@ ./sort.jl:2121 [inlined]
[6] _sortperm(df::DataFrame, a::SortingAlgorithms.TimSortAlg, o::Base.Order.Perm{Base.Order.ForwardOrdering, Vector{Union{Missing, String31}}})
@ DataFrames ~/.julia/packages/DataFrames/4GaAs/src/abstractdataframe/sort.jl:592
[7] sortperm(df::DataFrame, cols::Symbol; alg::Nothing, lt::typeof(isless), by::typeof(identity), rev::Bool, order::Base.Order.ForwardOrdering)
@ DataFrames ~/.julia/packages/DataFrames/4GaAs/src/abstractdataframe/sort.jl:589
[8] sortperm
@ ~/.julia/packages/DataFrames/4GaAs/src/abstractdataframe/sort.jl:577 [inlined]
[9] #sort#378
@ ~/.julia/packages/DataFrames/4GaAs/src/abstractdataframe/sort.jl:510 [inlined]
[10] sort(df::DataFrame, cols::Symbol)
@ DataFrames ~/.julia/packages/DataFrames/4GaAs/src/abstractdataframe/sort.jl:503
[11] top-level scope
@ REPL[5]:4
# CSV inexact error
ERROR: InexactError: trunc(Int64, NaN)
Stacktrace:
[1] trunc
@ ./float.jl:893 [inlined]
[2] ceil(#unused#::Type{Int64}, x::Float64)
@ Base ./float.jl:383
[3] CSV.Context(source::CSV.Arg, header::CSV.Arg, normalizenames::CSV.Arg, datarow::CSV.Arg, skipto::CSV.Arg, footerskip::CSV.Arg, transpose::CSV.Arg, comment::CSV.Arg, ignoreemptyrows::CSV.Arg, ignoreemptylines::CSV.Arg, select::CSV.Arg, drop::CSV.Arg, limit::CSV.Arg, buffer_in_memory::CSV.Arg, threaded::CSV.Arg, ntasks::CSV.Arg, tasks::CSV.Arg, rows_to_check::CSV.Arg, lines_to_check::CSV.Arg, missingstrings::CSV.Arg, missingstring::CSV.Arg, delim::CSV.Arg, ignorerepeated::CSV.Arg, quoted::CSV.Arg, quotechar::CSV.Arg, openquotechar::CSV.Arg, closequotechar::CSV.Arg, escapechar::CSV.Arg, dateformat::CSV.Arg, dateformats::CSV.Arg, decimal::CSV.Arg, groupmark::CSV.Arg, truestrings::CSV.Arg, falsestrings::CSV.Arg, stripwhitespace::CSV.Arg, type::CSV.Arg, types::CSV.Arg, typemap::CSV.Arg, pool::CSV.Arg, downcast::CSV.Arg, lazystrings::CSV.Arg, stringtype::CSV.Arg, strict::CSV.Arg, silencewarnings::CSV.Arg, maxwarnings::CSV.Arg, debug::CSV.Arg, parsingdebug::CSV.Arg, validate::CSV.Arg, streaming::CSV.Arg)
@ CSV ~/.julia/packages/CSV/OnldF/src/context.jl:647
[4] #File#32
@ ~/.julia/packages/CSV/OnldF/src/file.jl:222 [inlined]
[5] read(source::String, sink::Type; copycols::Bool, kwargs::Base.Pairs{Symbol, Int64, Tuple{Symbol, Symbol}, NamedTuple{(:skipto, :limit), Tuple{Int64, Int64}}})
@ CSV ~/.julia/packages/CSV/OnldF/src/CSV.jl:117
[6] top-level scope
@ REPL[6]:2