# Print duplicate questions.
#
# Looks in both queue and learned elements.
# For the first duplicate found, prints the following fields, separated by colons:
# 1.  Category
# 2.  The first occurrence, in full
# 3.  The second occurrence, in full
# 4.  The `reduced' string, used for matching/comparison
#
# The `reduced' string shows why the script thought two items were `equal'.
# You can adjust the first line of routine check() to taste, if you don't
# like the `fuzzy matching' algorithm.
#
# Comments etc. to michael at the domain of landcroft dot co dot united kingdom.

require "rexml/document"
require "rexml/streamlistener"

# Category filter.  Empty means "every category"; the `-c' option replaces
# it with a one-element list that the readers compare to their current
# category path.
$category = []

# Wordings seen so far, filled in by check():
#   category path => { comparison key => first wording, or nil once reported }
$words = {}

# Record one question and report it if it repeats an earlier one in the
# same category.
#
# catelist  - Array of category names; joined with "/" to name the bucket
#             in which duplicates are looked for.
# text_item - String, the full wording of the question.
# fuzzy     - optional; when true the comparison key is the `reduced'
#             wording (single-character words removed, whitespace tidied)
#             rather than the wording itself.  Defaults to false, which is
#             exact matching.
#
# Reads and updates the global $words
# (category => { key => first wording, or nil once reported }).
#
# The first time a key repeats, prints one line to standard output:
#   category:first wording:this wording:key:
# Further repeats of the same key are silent.  Empty keys are never
# reported.  The return value is not meaningful.
def check(catelist, text_item, fuzzy = false)
	category = catelist.join("/")
	reduced = text_item
	if fuzzy then
		reduced = text_item.gsub(/\b[A-Za-z0-9]\b/u, "").gsub(/\s\s/, " ").sub(/^\s/, "").sub(/\s$/, "")
	end
	# Pre-seeding "" with nil makes an empty key look `already reported'.
	seen = ($words[category] ||= { "" => nil })	# don't show null
	if not seen.key?(reduced) then
		seen[reduced] = text_item
	elsif not seen[reduced].nil?
		print category, ":", seen[reduced], ":", text_item, ":", reduced, ":\n"
		seen[reduced] = nil	# only show it once
	end
end


# Stream listener for elements.xml (the learned elements).
#
# Keeps track of the current category path and of the text of the <q>/<a>
# element being read.  When a <q> closes, its text is passed to check(),
# provided the global $category filter is empty or equals the current
# category path.
class DataReader

	include REXML::StreamListener

	# Arguments are accepted and ignored.
	def initialize(*argument)
		@item_attrs = { }
		@tag_stack = [ ]
		@category = [ ]
		@text_item = ""	# so text() never appends to nil
	end

	# Called for every opening tag.
	#
	# name       - tag name.
	# attributes - the tag's attributes, indexed by attribute name.
	#
	# Warns on standard error about a <fullrecall> whose core_version is not
	# 12 and about any tag name it does not know.
	def tag_start(name, attributes)
		@tag_stack.push name
		case name
		when "fullrecall"
			if attributes["core_version"].to_i != 12 then
				warn "Not version 12;  results not guaranteed"
			end
		when "category"
			@category << attributes["name"]
		when "q", "a"
			@text_item = ""
		when "item"
			# analyse attributes to determine WoN this item be `difficult'
			@item_attrs = attributes
		else
			warn "Unexpected tag <" + name + ">"
		end
	end

	# Called for character data.  The wording is appended to the current
	# item only while the innermost open tag is exactly q, a, b or u and the
	# category filter lets the current category through.
	def text(wording)
		if @tag_stack[-1] =~ /^[qabu]$/ and ($category == [ ] or @category == $category) then
			# += (not <<) builds a new String, so a wording already handed
			# to check() is never modified afterwards.
			@text_item += wording
		end
	end

	# Called for a CDATA section.  Removes the first <img>...</img> and the
	# first <snd>...</snd> span, drops every <b>, <u>, <s> tag and its
	# closing counterpart, then treats what is left as ordinary text.
	def cdata(wording)
		text(wording.sub(/<img>.*<\/img>/, "").sub(/<snd>.*<\/snd>/, "").gsub(/<[bus]>/, "").gsub(/<\/[bus]>/, ""))
	end

	# Called for every closing tag.
	def tag_end(name)
		# Keep the stack in step with the document so that text() looks at
		# the tag that is really innermost.
		@tag_stack.pop
		case name
		when "category"
			@category.pop
		when "q"
			if $category == [ ] or @category == $category then
				check(@category, @text_item)
			end
		else
		end
	end
end

# Stream listener for queue.txt (the items still in the queue).
#
# Keeps track of the current category path, the stack of open tags and the
# text of the <q>/<a> element being read.  When a <q> closes, its text is
# passed to check(), provided the global $category filter is empty or
# equals the current category path.
class QueueReader

	include REXML::StreamListener

	# Arguments are accepted and ignored.
	def initialize(*argument)
		@level = 0
		@tag_stack = [ ]
		@category = [ ]
		@text_item = ""	# so text() never appends to nil
	end

	# Called for every opening tag.
	#
	# name       - tag name.
	# attributes - the tag's attributes, indexed by attribute name.
	#
	# Warns on standard error about any tag name it does not know.
	def tag_start(name, attributes)
		@level += 1
		# print "<", name, ":", @level, ">"
		@tag_stack.push name
		case name
		when "category"
			@category.push(attributes["name"])
		when "b", "u", "img", "snd", "span"
			# nothing
		when "q", "a"
			@text_item = ""
		else
			warn "Unexpected tag <" + name + ">"
		end
	end

	# Called for character data.  The wording is appended to the current
	# item only while the innermost open tag is exactly q, a, b or u.
	def text(wording)
		# print wording
		if @tag_stack[-1] =~ /^[qabu]$/ then
			@text_item += wording
		end
	end

	# Called for every closing tag.  Warns on standard error when the tag
	# being closed is not the innermost open one.
	def tag_end(name)
		# print "</", @level, ":", name, ">\n"
		@level -= 1
		if @tag_stack.pop != name then
			# One string: warn prints each of its arguments on its own line.
			warn "Tag \"#{name}\" not matched"
		end
		case name
		when "category"
			@category.pop
		when "q"
			if $category == [ ] or @category == $category then
				check(@category, @text_item)
			end
		else
		end
	end
end

# Command line: `-c NAME' restricts the search to the category NAME.
# Any other argument is silently ignored.
while ARGV.length > 0 do
	case ARGV.shift
	when '-c'
		$category = [ ARGV.shift ]
	end
end

# queue.txt is not `real' XML but we still use the XML stream reader
# code to parse it.  Everything up to and including the line starting
# "This file should" is skipped; the parser gets the rest.
# The block form of File.open closes the file even if parsing raises.
File.open("queue.txt", "r") do |queue|
	while (line = queue.gets) && line !~ /^This file should/
	end
	REXML::Parsers::StreamParser.new(queue, QueueReader.new).parse
end

File.open("elements.xml", "r") do |learnt|
	REXML::Parsers::StreamParser.new(learnt, DataReader.new).parse
end
