Skip to content

Bounds error when sorting a column after select #3340

@George9000

Description

@George9000

Using Julia 1.9.1 and DataFrames 1.5.0, a `BoundsError` is thrown when attempting to sort a data frame by a column after a `select` whose result is assigned back to the same variable. The error is not reproducible with another column of the same data frame. Despite scrutinizing the contents of the column that triggers the error, I found no clues to its source.
[version details in expandable section below code]

Example data to reproduce the error is here.

using DataFrames, CSV

# Directory that holds the example data. The blocks below build the file path with
# `joinpath(datap, "examp.csv")`, so "." means the current working directory.
const datap = "."

# Bounds error
# Failing case: the frame is narrowed to `:sku` and `:units` with `select`, the result is
# rebound to `df`, and sorting that frame by `:sku` throws
# `BoundsError: attempt to access ... Vector{Int64} at index [0]` (see the stack trace in
# the error output section).
let
    df = CSV.read(joinpath(datap, "examp.csv"), DataFrame)
    df = select(df, [:sku, :units])  # keep only the two columns of interest
    sort(df, :sku)  # the call reported to throw
end


# Success
# Control case: same read and `select` as the failing block, but sorting by `:units`
# instead of `:sku` completes without error.
let
    df = CSV.read(joinpath(datap, "examp.csv"), DataFrame)
    df = select(df, [:sku, :units])
    sort(df, :units)
end


# Success
# Control case: same read and `select`, but the `:sku` column vector is sorted directly
# with `sort(df.sku)` rather than sorting the data frame by that column.
let
    df = CSV.read(joinpath(datap, "examp.csv"), DataFrame)
    df = select(df, [:sku, :units])
    sort(df.sku)
end


# Success
# Control case: the `select` step is skipped; sorting the freshly read frame by `:sku`
# completes without error.
let
    df = CSV.read(joinpath(datap, "examp.csv"), DataFrame)
    sort(df, :sku)
end



# CSV.read error: InexactError: trunc(Int64, NaN)
# Separate failure seen while trying to read only a slice of the file: passing both
# `skipto` and `limit` makes `CSV.read` throw (see the "CSV inexact error" stack trace).
let
    df = CSV.read(joinpath(datap, "examp.csv"), DataFrame; skipto = 8000, limit = 5000)
end


##  investigate :sku column ##
"""
    skucategory(sku)

Classify a SKU value by the characters it contains.

Return `missing` when `sku` is `missing`. Otherwise test `sku` against an ordered list of
patterns and return the label of the first one that matches:

- `"digits"`: digits only
- `"S_sku"`: only digits and the letter `S`
- `"alphanumeric"`: only alphanumeric characters
- `"slash"`: contains a forward slash anywhere

If no pattern matches, return the label `other`.
"""
function skucategory(sku)
    ismissing(sku) && return missing

    # Order matters: the earlier patterns are subsets of the later ones.
    rules = (
        r"^\d+$" => "digits",
        r"^[\dS]+$" => "S_sku",
        r"^[[:alnum:]]+$" => "alphanumeric",
        r"\/" => "slash",
    )
    for (pattern, label) in rules
        occursin(pattern, sku) && return label
    end
    return "other"
end

# Tally how many rows fall into each `skucategory` bucket, to look for unusual values in
# the `:sku` column.
let
    df = CSV.read(joinpath(datap, "examp.csv"), DataFrame)
    transform!(df, :sku => ByRow(skucategory) => :skucat)  # add the category as :skucat
    gdf = groupby(df, :skucat)
    combine(gdf, nrow)  # one row per category with its row count
end   
Version info, Project.toml, and error output
julia> versioninfo()
Julia Version 1.9.1
Commit 147bdf428c (2023-06-07 08:27 UTC)
Platform Info:
  OS: macOS (arm64-apple-darwin21.6.0)
  CPU: 10 × Apple M1 Max
  WORD_SIZE: 64
  LIBM: libopenlibm
  LLVM: libLLVM-14.0.6 (ORCJIT, apple-m1)
  Threads: 5 on 8 virtual cores
Environment:
  JULIA_NUM_THREADS = 4
  JULIA_EDITOR = Emacs
  JULIA_PKG_DEVDIR = /Users/george/Documents/julia/dev
  
Status `~/Desktop/exampl/Project.toml`
  [336ed68f] CSV v0.10.11
  [a93c6f00] DataFrames v1.5.0 `https://github.com/JuliaData/DataFrames.jl.git#main`

# Bounds error
ERROR: BoundsError: attempt to access 217465-element Vector{Int64} at index [0]
Stacktrace:
  [1] setindex!
    @ ./array.jl:969 [inlined]
  [2] _sort!(v::Vector{Int64}, a::Base.Sort.MissingOptimization{Base.Sort.BoolOptimization{Base.Sort.Small{10, Base.Sort.InsertionSortAlg, Base.Sort.IEEEFloatOptimization{Base.Sort.IsUIntMappable{Base.Sort.Small{40, Base.Sort.InsertionSortAlg, Base.Sort.CheckSorted{Base.Sort.ComputeExtrema{Base.Sort.ConsiderCountingSort{Base.Sort.CountingSort, Base.Sort.ConsiderRadixSort{Base.Sort.RadixSort, Base.Sort.Small{80, Base.Sort.InsertionSortAlg, Base.Sort.ScratchQuickSort{Missing, Missing, Base.Sort.InsertionSortAlg}}}}}}}, Base.Sort.StableCheckSorted{Base.Sort.ScratchQuickSort{Missing, Missing, Base.Sort.InsertionSortAlg}}}}}}}, o::Base.Order.Perm{Base.Order.ForwardOrdering, Vector{Union{Missing, String31}}}, kw::NamedTuple{(:lo, :hi, :allow_legacy_dispatch), Tuple{Int64, Int64, Bool}})
    @ Base.Sort ./sort.jl:583
  [3] sort!
    @ ./sort.jl:2123 [inlined]
  [4] sort!(v::Vector{Int64}, lo::Int64, hi::Int64, #unused#::SortingAlgorithms.TimSortAlg, o::Base.Order.Perm{Base.Order.ForwardOrdering, Vector{Union{Missing, String31}}})
    @ SortingAlgorithms ~/.julia/packages/SortingAlgorithms/n1AWW/src/SortingAlgorithms.jl:488
  [5] sort!
    @ ./sort.jl:2121 [inlined]
  [6] _sortperm(df::DataFrame, a::SortingAlgorithms.TimSortAlg, o::Base.Order.Perm{Base.Order.ForwardOrdering, Vector{Union{Missing, String31}}})
    @ DataFrames ~/.julia/packages/DataFrames/4GaAs/src/abstractdataframe/sort.jl:592
  [7] sortperm(df::DataFrame, cols::Symbol; alg::Nothing, lt::typeof(isless), by::typeof(identity), rev::Bool, order::Base.Order.ForwardOrdering)
    @ DataFrames ~/.julia/packages/DataFrames/4GaAs/src/abstractdataframe/sort.jl:589
  [8] sortperm
    @ ~/.julia/packages/DataFrames/4GaAs/src/abstractdataframe/sort.jl:577 [inlined]
  [9] #sort#378
    @ ~/.julia/packages/DataFrames/4GaAs/src/abstractdataframe/sort.jl:510 [inlined]
 [10] sort(df::DataFrame, cols::Symbol)
    @ DataFrames ~/.julia/packages/DataFrames/4GaAs/src/abstractdataframe/sort.jl:503
 [11] top-level scope
    @ REPL[5]:4


# CSV inexact error
ERROR: InexactError: trunc(Int64, NaN)
Stacktrace:
 [1] trunc
   @ ./float.jl:893 [inlined]
 [2] ceil(#unused#::Type{Int64}, x::Float64)
   @ Base ./float.jl:383
 [3] CSV.Context(source::CSV.Arg, header::CSV.Arg, normalizenames::CSV.Arg, datarow::CSV.Arg, skipto::CSV.Arg, footerskip::CSV.Arg, transpose::CSV.Arg, comment::CSV.Arg, ignoreemptyrows::CSV.Arg, ignoreemptylines::CSV.Arg, select::CSV.Arg, drop::CSV.Arg, limit::CSV.Arg, buffer_in_memory::CSV.Arg, threaded::CSV.Arg, ntasks::CSV.Arg, tasks::CSV.Arg, rows_to_check::CSV.Arg, lines_to_check::CSV.Arg, missingstrings::CSV.Arg, missingstring::CSV.Arg, delim::CSV.Arg, ignorerepeated::CSV.Arg, quoted::CSV.Arg, quotechar::CSV.Arg, openquotechar::CSV.Arg, closequotechar::CSV.Arg, escapechar::CSV.Arg, dateformat::CSV.Arg, dateformats::CSV.Arg, decimal::CSV.Arg, groupmark::CSV.Arg, truestrings::CSV.Arg, falsestrings::CSV.Arg, stripwhitespace::CSV.Arg, type::CSV.Arg, types::CSV.Arg, typemap::CSV.Arg, pool::CSV.Arg, downcast::CSV.Arg, lazystrings::CSV.Arg, stringtype::CSV.Arg, strict::CSV.Arg, silencewarnings::CSV.Arg, maxwarnings::CSV.Arg, debug::CSV.Arg, parsingdebug::CSV.Arg, validate::CSV.Arg, streaming::CSV.Arg)
   @ CSV ~/.julia/packages/CSV/OnldF/src/context.jl:647
 [4] #File#32
   @ ~/.julia/packages/CSV/OnldF/src/file.jl:222 [inlined]
 [5] read(source::String, sink::Type; copycols::Bool, kwargs::Base.Pairs{Symbol, Int64, Tuple{Symbol, Symbol}, NamedTuple{(:skipto, :limit), Tuple{Int64, Int64}}})
   @ CSV ~/.julia/packages/CSV/OnldF/src/CSV.jl:117
 [6] top-level scope
   @ REPL[6]:2

Metadata

Metadata

Assignees

No one assigned

    Labels

    Type

    No type

    Projects

    No projects

    Milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions