Calling `first(some_set)` gets me "ArgumentError: collection must be non-empty", even though I checked the Set for emptiness immediately before

This is Julia 1.6.1. I hit an error that seems really weird right now. The code’s basically like this:

loop
         if isempty(some_set)
                 # Change some_set
                 # ...

                 continue
         end

         local a = first(some_set)
         delete!(some_set, a)

         # ...
end

In the first call, I get this error: “ArgumentError: collection must be non-empty”, even though I check for that condition in the if immediately above.

This is the full code, if anybody is willing to take a look. There’s only one call to first:

# Copyright © 2021: Neven Sajko
#
# Main input: arguments of function 'search'
#	* A file name
#		* The file must be a connected partitioned word list.
#		* The file name must end in this form:
#		  list_XXXXXX_YYYYYY.txt
#		* XXXXXX is the count of word classes in the list.
#		* YYYYYY is just some number unique to the list.
#	* Two natural numbers M and N. M must be less than N. The
#	  numbers define the range M:N, from which the roots of the
#	  search are taken. E.g., if M is 2 and N is 3: the roots are
#	  2 and 3, i.e., the second word class in the file and the
#	  third word class in the file.
#	* A natural number: the minimal length that a Kaladont sequence
#	  has to have for us to be interested in it.
#
# Main output: ./data/abstract_kaladont_sequences/seq_XXXXXX_YYYYYY_ZZZZZ_TTTT.txt
#	* ZZZZZ is the count of words in the sequence.
#	* TTTT is the thread ID of the thread which found the sequence.
#	  This just serves so I wouldn't have to synchronize the
#	  threads.
#
# Additional I/O: ./data/sequence_root_explored/done_XXXXXX_YYYYYY_CCCCCC
#	* All of the files in the directory sequence_root_explored are
#	  read and need to have that naming scheme.
#	* CCCCCC denotes the class of words within the list which was
#	  already fully explored as a root for the search.
#	* As output, this is a sort of progress report.
#	* As input, this allows restarting the program without needing
#	  to unnecessarily explore options which were explored already
#	  in previous runs.
#	* Note: these are just empty files, only their names are
#	  important.
#
# Additional input: ./data/exit_when_done_with_root
#	* If this file is found to exist during this program's
#	  execution, the program exits after finishing with the current
#	  roots of the search. (There can be multiple current roots
#	  because of multiple parallel threads of execution each
#	  starting from a distinct root.)

module FindAbstractKaladontSequences

using Base.Threads, Printf, StaticArrays

export search

# A pair of encoded characters (see encodedCharacter: 1-26 stand for
# 'a'-'z', 27 stands for '-').
const TwoChars = Tuple{UInt8, UInt8}

# A two-character prefix and a two-character suffix.
const PreSuf = Tuple{TwoChars, TwoChars}

# The word-class graph that the search walks. Built once by load and
# then shared (read-only) by all search threads.
struct Graph
	# The count of words in each word class.
	presuf_to_class_size :: Dict{PreSuf, Int32}

	# The index of each word class.
	presuf_to_index :: Dict{PreSuf, Int16}

	# The word class that corresponds to an index.
	index_to_presuf :: Vector{PreSuf}

	# A sort of adjacency list.
	#
	# Maps a prefix to all word classes which have that prefix.
	# (27 = the 26 letters plus the encoding of '-'.) The SMatrix is
	# immutable, but the contained Sets are not: consumers must not
	# mutate them.
	prefix_to_presuf :: SMatrix{27, 27, Set{PreSuf}, 27^2}
end

# Encode a character: 'a'-'z' map to 1-26, '-' maps to 27.
function encodedCharacter(c)
	local code = UInt8(c)
	if code == UInt8('-')
		return UInt8(27)
	end
	return UInt8(code - UInt8('a') + 1)
end

# Inverse of encodedCharacter: 1-26 map to the codes of 'a'-'z', and
# 27 maps to the code of '-'.
function decodedCharacter(encoded)
	if encoded == UInt8(27)
		return UInt8('-')
	end
	return UInt8(encoded + UInt8('a') - 1)
end

# The word class of a word: its encoded two-character prefix paired
# with its encoded two-character suffix.
function presuf(s::String)
	local n = length(s)
	return (encodedCharacter.((s[1], s[2])),
	        encodedCharacter.((s[n - 1], s[n])))
end

# Render a word class as a four-character string: the decoded prefix
# followed by the decoded suffix.
function presuf_for_humans(ps::PreSuf)
	local pre = Char.(decodedCharacter.(ps[1]))
	local suf = Char.(decodedCharacter.(ps[2]))
	return string(pre[1], pre[2], suf[1], suf[2])
end

# Build a Graph from a connected partitioned word list file.
#
# Each non-empty line of the file is one word. Words sharing the same
# two-character prefix and two-character suffix (see presuf) fall into
# the same word class; classes are indexed in order of first
# appearance.
#
# NOTE(review): the Sets stored in prefix_to_presuf are the very Sets
# placed into the returned Graph — callers must copy them before
# mutating, or they corrupt the shared graph.
function load(input_file::String)
	local presuf_to_class_size = Dict{PreSuf, Int32}()
	local presuf_to_index = Dict{PreSuf, Int16}()
	local index_to_presuf = PreSuf[]
	# 27x27 mutable matrix of adjacency Sets, indexed by the two
	# encoded prefix characters (1-26 for 'a'-'z', 27 for '-').
	local prefix_to_presuf = @MMatrix[Set{PreSuf}() for _ in 1:27, _ in 1:27]

	for line in eachline(input_file)
		(length(line) == 0) && continue

		local ps = presuf(line)

		# First occurrence of this word class: register its index.
		if !haskey(presuf_to_class_size, ps)
			presuf_to_class_size[ps] = 0

			push!(index_to_presuf, ps)
			presuf_to_index[ps] = length(index_to_presuf)
		end

		presuf_to_class_size[ps] += 1
		push!(prefix_to_presuf[ps[1][1], ps[1][2]], ps)
	end

	# Freeze the adjacency matrix into an immutable SMatrix (the
	# contained Sets themselves stay mutable).
	Graph(presuf_to_class_size,
	      presuf_to_index,
	      index_to_presuf,
	      SMatrix(prefix_to_presuf))
end

# Write one found walk to the file <prefix><walk length>_<thread id>,
# unless such a file already exists. Each output line holds a word
# class's index and its human-readable prefix/suffix form.
function report_seq(prefix::String, g::Graph,
                    walk::Vector{NamedTuple{(:walk_node, :need_traversing),
                                            Tuple{PreSuf, Set{PreSuf}}}})
	local file_name = string(prefix, Printf.@sprintf("%05d", length(walk)), "_",
	                         Printf.@sprintf("%04d", Threads.threadid()))

	# A walk of this length was already reported by this thread.
	Base.Filesystem.ispath(file_name) && (return nothing)

	# Assemble the report in a buffer instead of concatenating
	# strings repeatedly.
	local buf = IOBuffer()
	for node in walk
		print(buf, Printf.@sprintf("%05d", g.presuf_to_index[node.walk_node]),
		      " ", presuf_for_humans(node.walk_node), "\n")
	end

	open(file_name, lock = false, write = true, create = true) do f
		write(f, String(take!(buf)))
	end

	return nothing
end

# If vert's visit count reached its word-class size, mark it as no
# longer visitable by adding it to max_visited.
function update_max_visited(vert::PreSuf,
                            max_visited::Set{PreSuf},
                            visited::Dict{PreSuf, Int32},
                            g::Graph)
	if visited[vert] == g.presuf_to_class_size[vert]
		push!(max_visited, vert)
	end

	return nothing
end

# Search for long Kaladont sequences (walks through the word-class
# graph), one thread per root in m:n.
#
# Arguments:
#	* input_file: a connected partitioned word list whose name ends
#	  in list_XXXXXX_YYYYYY.txt (see the header comment).
#	* m, n: the range m:n of word-class indices used as search roots.
#	* minimal_length: walks shorter than this are not reported.
#
# Returns nothing; sequences and progress markers are written to files
# (see the header comment).
function search(input_file::String, m::Int, n::Int, minimal_length::Int)
	# Check the range's lower bound and ordering.
	if !(1 <= m && m <= n)
		print("invalid range\n")
		return nothing
	end

	local exit_signal_file = "./data/exit_when_done_with_root"

	# Refuse to start while a stale exit request exists.
	if Base.Filesystem.ispath(exit_signal_file)
		print(string("delete ", exit_signal_file, " before starting the search\n"))
		return nothing
	end

	# Load graph from input file.
	local g = load(input_file)

	# Check the range's upper bound.
	if (length(g.index_to_presuf) < n)
		print("range out of bounds\n")
		return nothing
	end

	# The XXXXXX_YYYYYY part of the input file name, reused in the
	# output file names.
	local io_file_name_code = input_file[(length(input_file) - 16) : (length(input_file) - 4)]
	local seq_prefix = string("./data/abstract_kaladont_sequences/seq_",
	                          io_file_name_code, "_")
	local done_prefix = string("./data/sequence_root_explored/done_",
	                           io_file_name_code, "_")

	Threads.@threads for search_root in m:n
		local done_signal_file = string(done_prefix, Printf.@sprintf("%06d", search_root))

		# Check if the part of the search with this root is
		# already done. If so, skip to the next root.
		Base.Filesystem.ispath(done_signal_file) && continue

		# The root vertex (word class).
		local root_vert = g.index_to_presuf[search_root]

		# The vertices of a walk through the graph.
		#
		# Bug fix: copy the adjacency Set instead of storing a
		# reference to it. The walk mutates need_traversing via
		# delete! below, so pushing the Set owned by the shared
		# graph corrupted g.prefix_to_presuf and raced with the
		# other threads — which is why first() could throw
		# "collection must be non-empty" right after an isempty
		# check.
		local walk_stack = [(walk_node = root_vert,
		                     need_traversing = copy(g.prefix_to_presuf[root_vert[2][1],
		                                                               root_vert[2][2]]))]

		# How many times was each vertex (word class) visited.
		local visited = Dict{PreSuf, Int32}(root_vert => 1)

		# The vertices that can't be visited again, i.e., those
		# which are not potential successors.
		local max_visited = Set{PreSuf}()
		update_max_visited(root_vert, max_visited, visited, g)

		# Depth-first search with explicit backtracking.
		while !isempty(walk_stack)
			if isempty(last(walk_stack).need_traversing)
				# We reached a dead end, backtrack!
				local v = pop!(walk_stack).walk_node
				visited[v] -= 1
				(visited[v] == 0) && delete!(visited, v)
				delete!(max_visited, v)
				continue
			end

			# Pick a successor node. The Set on the stack is
			# thread-local (copy/setdiff), so mutating it is
			# safe.
			local suc = first(last(walk_stack).need_traversing)
			delete!(last(walk_stack).need_traversing, suc)

			haskey(visited, suc) || (visited[suc] = 0)
			visited[suc] += 1
			update_max_visited(suc, max_visited, visited, g)

			# setdiff allocates a fresh Set, leaving the
			# graph's adjacency Sets untouched.
			local need_traversing = setdiff(g.prefix_to_presuf[suc[2][1],
			                                                   suc[2][2]],
			                                max_visited)
			push!(walk_stack, (walk_node = suc,
			                   need_traversing = need_traversing))

			# Record the walk if it's long enough.
			(minimal_length <= length(walk_stack)) &&
			  report_seq(seq_prefix, g, walk_stack)
		end

		# Signal that we finished the part of the search that
		# starts from this root.
		Base.Filesystem.touch(done_signal_file)

		# If an exit was requested, stop this thread's work.
		# NOTE(review): break inside a @threads loop ends only
		# this thread's chunk of roots, not the whole threaded
		# loop — confirm that matches the intent described in the
		# header comment.
		Base.Filesystem.ispath(exit_signal_file) && break
	end

	return nothing
end

end  # module

In case somebody wants to run it, it requires a file like this as input in ./data/connected_partitioned_word_lists/list_000043_000001.txt:

kabac

aca

caano

inacica

kiabudin

inace

ce-ov

ce

ice

icanovic

naakovic

ana

Looks like a data race to me. You load your data into a shared graph, push nodes from that graph onto a (thread-local) working stack, but you don’t make sure that only one reference to any given node exists across all working stacks. So at some point, one thread is preempted and another thread manipulates the object/Set that the first thread required exclusive access to.

If you remove the Threads.@threads, it should work as designed.

2 Likes

A data race indeed. The underlying issue is that I copied references to the Sets instead of copying the Sets themselves.