Document Actions

DB_database.stl

by Paul McJones — last modified 2021-02-25 11:33
The main big strings and database implementation, with test.
--files: string_utility_pak.stl,sort_pak.stl,get_lines_pak.stl,random_pak.stl
--			disk_records_refct.stl, B_tree_for_bigs_refct.stl, B_tree_for_bigs_wdix.stl, B_tree_for_bigs_wdoc.stl,
--			B_tree_for_bigs_dbix.stl, Big_stg_for_wdoc.stl
--file database_top.stl  test_bs
package big_string_pak;

const bs_code_pts := {"copy", "simple", "optimized", "chop", "tree", "concat_join", "concat_balance", "concat_chop", 	
	"nocon_opt", "nocon", "nocon_left", "nocon_right", "nocon_iter", "nocon_del", "nocon_ins", 	"nocon_to_simp"};
			-- code points to be traversed

	procedure bs_from_stg(stg);				-- create a big string from a string
	procedure stg_from_bigstg(rec);			-- make a string from a big_string
	procedure bs_slice(rec,i,j);			-- the slice extraction operation
	procedure bs_length(rec);				-- length of a bigstring
	procedure bs_set_slice(rw rec,i,j,stg);	-- the slice assignment operation
												
												-- (PUBLIC FOR DEBUGGING ONLY)
	procedure stg_past(j,rec);				-- make a string from a the part of a big_string from j onward
	procedure stg_till(j,rec);				-- make a string from a the part of a big_string up to j
	procedure chop_up(first,mid,last);		-- chop the concatenation of three strings into a tuple of string records
	procedure print_raw(); 					-- raw print for debugging
	procedure bnr_voc(rec,j);				-- j'th member of vector of children for B_tree_for_bigstring records
	procedure bs_check_leaves(rec);			-- leaf consistency check

end big_string_pak;

package body big_string_pak;

use setldb,byteutil,disk_records_pak,db_records,B_tree_for_bigstring,string_utility_pak;

procedure bs_from_stg(stg);					-- make a big_string from a string

												-- we first chop up the string into a tuple of string records
	if #(pieces_tup := chop_up("",stg,"")) = 1 then return make_record(pieces_tup(1)); end if;
						-- see comment on the 'chop_up' routine, below. If the whole string fits in 
						-- just one string record, we simply return that record.
--print("#pieces_tup: ",#pieces_tup);
--print("Total len ",+/[#y:y in pieces_tup]);

	return bnr_make_from_tuple(pieces_tup);		-- otherwise convert the list of sections returned into a tree

end bs_from_stg;

procedure stg_from_bigstg(rec);					-- make a string from a big_string

	if dr_load(rec)(type_byte) = string_record then return sr_slice(rec,1,sr_length(rec)); end if;
		-- get string from record

	if not dr_is_compound(rec) then 			-- concatenate strings from records
		return  "" +/ [sr_slice(ch,1,sr_length(ch)): j in [1..num_childr(rec)] | (ch := bnr_voc(rec,j)) /= OM];
	end if;

	return "" +/ [stg_from_bigstg(bnr_voc(rec,j)): j in [1..num_childr(rec)]];	-- else proceed recursively
	
end stg_from_bigstg;

procedure stg_past(j,rec);					-- make a string from a the part of a big_string from j onward

	if dr_load(rec)(type_byte) = string_record then			-- get string from record

		if j > (sl := sr_length(rec)) then 
			abort("Illegal character index in simple stg_past operation: " + str(j) + ", " + sl);
		end if;

		return sr_slice(rec,j,sl); 		-- return slice from j onward 

	end if;
	
							-- get the child containing the first character past j 
	if not (exists chix in [1..nc := num_childr(rec)] | bnr_get_ch_cum(rec,chix) >= j) then 
		abort("Illegal character index in stg_past operation: " + str(j) + ", " + nc);
	end if;
	
	prev_cum := if chix = 1 then 0 else bnr_get_ch_cum(rec,chix - 1) end if;	-- the previous cumulant

	return stg_past(j - prev_cum,bnr_voc(rec,chix)) 
							+/ [stg_from_bigstg(bnr_voc(rec,j)): j in [chix + 1..nc]];
	
end stg_past;

procedure stg_till(j,rec);					-- make a string from a the part of a big_string up to j
		
	if dr_load(rec)(type_byte) = string_record then		-- simple string record case

		if j > (sl := sr_length(rec)) then 
			abort("Illegal character index in simple stg_past operation: " + str(j) + ", " + sl);
		end if;
		
		return sr_slice(rec,1,j); 			-- return slice up to j

	end if;
	
							-- get the child containing the first character past j 
	if not (exists chix in [1..nc := num_childr(rec)] | bnr_get_ch_cum(rec,chix) >= j) then 
		abort("Illegal character index in stg_past operation: " + str(j) + ", " + nc);
	end if;
	
	prev_cum := if chix = 1 then 0 else bnr_get_ch_cum(rec,chix - 1) end if;	-- the previous cumulant

	return  ("" +/ [stg_from_bigstg(bnr_voc(rec,j)): j in [1..chix - 1]]) 
												+ stg_till(j - prev_cum,bnr_voc(rec,chix));
	
end stg_till;

procedure bs_slice(rec,i,j);		-- the slice extraction operation concatenates all the characters
									-- between the two indicated positions. the 'node' can be either 
									-- the top of a tree, or can be a simple string_record
				-- for a tree, we locate the sections F, L containing the first and last characters, and
				-- return the concatenation of the part of F past i, the part of L up to j,
				-- and the concatenation of the strings associated with all intermediate child trees
--print("bs_slice: ",i," ",j);	
	if i < 1 or j < i - 1 then 
		abort("Illegal second and/or first parameters in string extraction operation" + str(j) + ", " + str(i));
	end if;
			
	if dr_load(rec)(type_byte) = string_record then		-- simple string record case

		sl := sr_length(rec);
		if j > sl then 				-- get string from record
			print("bs_slice: Illegal character index in simple string extraction operation: " + str(j) + ", " + str(sl)); stop;
		end if;
--print("bs_slice bottom: ",i," ",j," ",sr_slice(rec,i,j));		
		return sr_slice(rec,i,j); 

	end if;

							-- get the child containing the first character past i 
	if not (exists chixi in [1..nc := num_childr(rec)] | bnr_get_ch_cum(rec,chixi) >= i) then 
		abort("Illegal first character index in string extraction operation: " + str(i) + ", " + str(nc));
	end if;
							-- get the child containing the first character past j 
	if not (exists chixj in [1..nc := num_childr(rec)] | bnr_get_ch_cum(rec,chixj) >= j) then 
		abort("Illegal second character index in string extraction operation: " + str(j) + ", " + str(nc));
	end if;
	
	prev_cumi := if chixi = 1 then 0 else bnr_get_ch_cum(rec,chixi - 1) end if;	-- the cumulant previous to i
	prev_cumj := bnr_get_ch_cum(rec,chixj - 1);									-- the cumulant previous to j
	
	if chixi = chixj then 				-- proceed recursively
		return bs_slice(bnr_voc(rec,chixi),i - prev_cumi,j - prev_cumi); 
	end if;

	xxx:=(stg_past(i - prev_cumi,bnr_voc(rec,chixi)) +/[stg_from_bigstg(bnr_voc(rec,k)): k in [chixi + 1..chixj - 1]]) 
					+ stg_till(j - prev_cumj,bnr_voc(rec,chixj));	-- proceed recursively: from, middle, end


	return xxx;
	return (stg_past(i - prev_cumi,bnr_voc(rec,chixi)) +/[stg_from_bigstg(bnr_voc(rec,k)): k in [chixi + 1..chixj - 1]]) 
					+ stg_till(j - prev_cumj,bnr_voc(rec,chixj));	-- proceed recursively: from, middle, end

end bs_slice;

procedure bnr_voc(rec,j);					-- j'th member of vector of children for B_tree_for_bigstring records

	stg := dr_load(rec);					-- load this string
	cjstrt := (j - 1) * 4 + bnr_ch_start; 
	return stg(cjstrt..cjstrt + 3);		-- return child rec
	
end bnr_voc;

procedure bnr_get_ch_cum(rec,j);	-- get the cum value for the j-th child of this node
	
	return int_of_5(dr_load(rec)(bnr_cum_start + (j - 1) * 5..bnr_cum_start + j * 5 - 1));
	
end bnr_get_ch_cum;

procedure chop_up(first,mid,last);		-- chop the concatenation of three strings into a tuple of string records

	-- if all three pieces fit into one, two or three sections, return these. Otherwise join as much as possible of the 
	-- middle into two pieces, and then the reminder of the middle into approximately equal-sized pieces.
	-- first and lat are assumed to be no more than one record size each

	if (total := (nf := #first) + (nm := #mid) + (nl := #last)) <= sr_hi_lim then 		-- put all into one piece
		return [first + mid + last]; 
	end if;

	if total < 2 * sr_hi_lim then		-- two pieces are enough

		center := total/2;		-- half the total
		
		if center <= nf then 
			return [first(1..center), first(center + 1..) + mid + last]; 
		end if;

		if center <= nf + nm then 
			return [first + mid(1..used := center - nf), mid(used + 1..) + last]; 
		end if;

		return [first  + mid + last(1..used := center - nf - nm), last(used + 1..)]; 

	end if;			-- otherwise first + mid cannot fit into one piece
			
	if total <= 3 * sr_hi_lim then 			-- if they fit into 3 pieces, put as much as possible 
												-- of first and mid into one piece, and chop up the remainder
	 	return [first + mid(1..used := sr_hi_lim - nf)] + chop_up("",mid(used + 1..),last);

	end if;			-- otherwise put as much as possible of mid together with first and last, and
					-- cut the reminder of the middle into approximately equal-sized pieces.
	first := first + mid(1..used := sr_hi_lim - nf);
	last := mid((new_end := nm - (sr_hi_lim - nl)) + 1..) + last;
	
	npieces := (rem := new_end - used)/sr_hi_lim;		-- number of full-sized pieces
	psm1 := sr_hi_lim - 1;
	
	if (extra_part := rem mod sr_hi_lim) = 0 then		-- fits into a list of full pieces
	
		return [first] + [mid(j..j + psm1): j in [used + 1,used + sr_hi_lim + 1..new_end - psm1]] + [last];
	
	else												-- there is a bit left over

		start_last_2 := new_end - psm1 - extra_part;			-- starting character of last 2 sections
		mid_last_2 := start_last_2 + (psm1 + extra_part)/2;			-- middle character of last 2 sections
						
											-- use list of all but 1 full piece, plus two smaller pieces.
		return [first] + [mid(j..j + psm1): j in [used + 1,used + sr_hi_lim + 1..new_end - 2 * psm1 - extra_part]]
			 + [mid(start_last_2..mid_last_2),mid(mid_last_2 + 1..new_end)] + [last];
	
	end if;
	
end chop_up;

procedure make_record(stg);		-- creates one record from string

	rec := set_type(dr_new_rec(),string_record); sr_set_slice(rec,1,0,stg);
	return rec;
	
end make_record;

procedure pass_pass(stg); print(stg); pass(stg); end pass_pass;		-- print and pass for debugging

procedure bs_set_slice(rw rec,i,j,stg);		-- the slice assignment operation
--print("bs_set_slice: ",i," ",j," ",#stg," ",abs(dr_load(rec)(type_byte)));
	if i < 1 or j < i - 1 then 
		abort("Illegal second and first parameters in string extraction operation" + str(j) + ", " + str(i));
	end if;
											-- since rec will change, ensure that it has just one copy. 
											-- This will up the refcounts of its children if necessary
	if refcount(int_of_4(rec)) > 1 then						-- must copy
pass("copy");
		stgg := dr_load(rec); new_r := dr_new_rec(); 
		dr_setrecbuf(new_r,stgg);
		dr_dirtify(new_r);
		increfs(new_r,1); incref(rec,-1); rec := new_r;		-- substitute copy for original
	end if;

	if (contents := dr_load(rec))(type_byte) = string_record then		-- simple string record case
			-- in the simple string record case we extract the un-written-over portions of the string, 
			-- and chop up F + stg + L to get a new list of leaves, which are made into a node or a tree.
pass("simple");
--print("simple: ",i," ",j," ",#stg);
		if j > (sl := sr_length(rec)) then 				-- get string from record
			abort("bs_set_slice: Illegal character index in simple string extraction operation: " + str(j) + ", " + str(sl));
		end if;
					-- if F + stg + L fits in just one section, we perform the operation as a string slice assignment,
					-- to avoid string copying where unnecessary 

		if (new_len := (i - 1) + (lstg := #stg) + (sl - j)) <= sr_hi_lim then				-- handle as an optimized case
pass("optimized");
			sr_set_slice(rec,i,j,stg);		-- replace slice
--print("optimized case: ",i," ",j," ",sr_slice(rec,1,sr_length(rec)));			
			return;				-- done with this optimized case
			
		else 						-- we must actually chop up F + stg + L
pass("chop");					
			first := contents(sr_char_start..sr_char_start + i - 2); 	-- till the i-1-th character
			last := contents(sr_char_start + j..sr_char_start + sl - 1);		-- from the j + 1-st character
			pieces_tup := chop_up(first,stg,last);

		end if;
		
		incref(rec,-1);		-- the old form of the record will lose one reference
		
		rec := bnr_make_from_tuple(pieces_tup);		-- convert the list of sections returned into a tree
	
		return;
		
	end if;		-- otherwise we deal with the B-tree case
pass("tree");
	if i > (len_this := bnr_get_cum(rec)) + 1 or j > len_this then 
		abort("Out of range first parameter or second in string assignment operation " + str(j) + ", " + str(i) + " Length is " + str(len_this));
	end if;

	if i = len_this + 1 then		-- we have a simple concatenation operation
		
		if (lstg := #stg) < sr_low_lim then			-- balance or join with last node
		
			[last_node,lcum] := bnr_comp_cum(rec,OM);			-- get the last node, and its string contents
			ln_stg := sr_slice(last_node,1,lln := sr_length(last_node));

			if lln + lstg <= sr_hi_lim then	-- join with last node
pass("concat_join");	
				
				old_last_node := last_node;
				incref(old_last_node,1);			-- must increment to prevent length of referenced node from changing
							-- ********** TOTO: FIX THIS IN YOUR BIG_STRING CODE **********			
				sr_set_slice(last_node,lln + 1,lln,stg);		-- last_node is copied and old_last_node loses a ref
				bnr_set_comp(rec,lcum,last_node);		-- insert the (new) last node; old_last_node loses another ref
				incref(last_node,-1);				-- the variable 'last_node' is now dead

				return;									-- done with this optimized case
			end if;										-- otherwise we will balance
pass("concat_balance");
			old_last_node := last_node;
			incref(old_last_node,1);			-- must increment to prevent length of referenced node from changing
							-- ********** TOTO: FIX THIS IN YOUR BIG_STRING CODE **********			
			[new_ln_stg,stg] := chop_up(ln_stg,stg,"");	-- cut into two string sections
			sr_set_slice(last_node,1,lln,new_ln_stg);
			bnr_set_comp(rec,lcum,last_node);			-- insert the (possibly new) last node
			incref(last_node,-1);						-- the variable 'last_node' is now dead
			bnr_insert(rec,OM,leaf := make_record(stg));	-- insert extra part at the end
			incref(leaf,-1);							-- the variable 'leaf' is now dead
			return;										-- done with this optimized case
			
		end if;
pass("concat_chop");		
		pieces_tup := chop_up("",stg,"");
--print("concatenation: ",pieces_tup);		
		for piece in pieces_tup loop		-- append all these pieces to the existing record
			bnr_insert(rec,OM,leaf := make_record(piece));			-- at the end
			incref(leaf,-1);	-- the variable 'leaf' is now dead
		end loop;		-- note that the first insert may copy rec and decrement its refcount if it is shared;
						-- in this case, ref will correctly be repaced by the new copy 
				
		return;		-- done with this case

	end if;		-- end of the case in which insertion is at the very end
	
		-- otherwise we need to make an insertion into the middle of an existing big_string
	-- we extract the un-written-over portions F and L of the first and last sections affected,
	-- and chop up F + stg + L to get a new list of leaves. These replace the leaves 
	-- of the original tree, running from the first to the last section affected.
							-- get the child containing the first character past i 
	[leafi,cumi] := bnr_comp_cum(rec,i);		-- get the leaf containing the first character past i, and its cumulant 
	[leafj,cumj] := bnr_comp_cum(rec,j);		-- get the leaf containing the first character past j, and its cumulant 
--print("insertion in middle: ",cumi," ",cumj);	
			-- if cumi = cumj, and F + stg + L fits in just one section and does not fall beneath the required minimum 
			-- size, we perform the operation as a string slice assignment, to avoid string copying where unnecessary 

	if cumi = cumj			-- we may be able to work in an existing section
		and (new_len := (lstg := #stg) + (sli := sr_length(leafi)) - (j - i + 1)) <= sr_hi_lim 
			and new_len >= sr_low_lim then
pass("nocon_opt");
		prev_cum := cumi - sli;			-- cumulated length before the i-th (= j-th) leaves
		old_leafi := leafi;
		incref(old_leafi,1);			-- must increment to prevent length of referenced node from changing
							-- ********** TOTO: FIX THIS IN YOUR BIG_STRING CODE **********			

		sr_set_slice(leafi,i - prev_cum,j - prev_cum,stg);
											-- replace slice; leafi is copied, so its old version will lose 1 ref
		bnr_set_comp(rec,i,leafi);			-- reset set the leaf of rec, since it will have changed
											-- the old version now loses another ref
		incref(leafi,-1);					-- the variable leafi is now dead (compensate for extra ref added by set_comp)
		return;								-- done with this optimized case

	end if;
	
	contentsi := dr_load(leafi); sli := sr_length(leafi); irel := i - (cumi - sli);
	first := contentsi(sr_char_start..sr_char_start + irel - 2); 	-- till the i-1-th character
	
	contentsj := dr_load(leafj); slj := sr_length(leafj); jrel := j - (cumj - slj);
	last := contentsj(sr_char_start + jrel..sr_char_start + slj - 1);		-- from the j + 1-st character

	pieces_tup := chop_up(first,stg,last);

	-- now the strings in pieces_tup must be turned into leaves which replace the leaves of the tree,
	-- starting from leafj and ending with leafi. This iteration uses the cumulants and works backwards,
	-- testing for the appearance of leafi. If this is encountered before pieces_tup is exhausted, the 
	-- reminming elements of pieces_tup are inserted at the appropriate position. If pieces_tup
	-- is exhausted before leafi is encountered, the leaves forward from the one replaced up
	-- to and including leafi is deleted.
	
	-- however, the case in which pieces_tup contins just one element may be special, since this may fall below the 
	-- minimum number of characters wanted for a leaf
pass("nocon");
	if #pieces_tup = 1 and #(all := pieces_tup(1)) < sr_low_lim then
			-- we balance or join the short string we have with any left or right neighbor leaf 

		if cumi > sli then		-- there is a left neighbor
pass("nocon_left");
			[left_neighbor,-] := left_pair := bnr_comp_cum(rec,cumi - sli);	-- get the left neighbor and its cumulant
			left_neighbor_stg := sr_slice(left_neighbor,1,sr_length(left_neighbor));
			pieces_tup := chop_up(left_neighbor_stg,all,"");		-- use this as the new pieces_tup
			[leafi,cumi] := left_pair;								-- use left_neighbor as the leafi 
			
		elseif cumj < len_this then 						-- there is a right neighbor
pass("nocon_right");
			[right_neighbor,-] := right_pair := bnr_comp_cum(rec,cumj + 1);	-- get the right neighbor and its cumulant
			right_neighbor_stg := sr_slice(right_neighbor,1,sr_length(right_neighbor));
			pieces_tup := chop_up("",all,right_neighbor_stg);		-- use this as the new pieces_tup
			[leafj,cumj] := right_pair;								-- use left_neighbor as the leafi 
		end if;
		
	end if;		-- after this the treatment is like that of all other cases
	
	cum_now := cumj;						-- the following iteration is governed by the cumulants
	pieces_tup_ix_now := #pieces_tup;		-- start at the end of pieces_tup

	while cum_now >= cumi and pieces_tup_ix_now >= 1 loop
pass("nocon_iter");		
		oll := sr_length(bnr_comp(rec,cum_now));		-- the leaf which will be replaced
		
		bnr_set_comp(rec,cum_now,leaf := make_record(pieces_tup(pieces_tup_ix_now)));
		incref(leaf,-1);	-- the variable 'leaf' is now dead
--print("chopping up: ",#pieces_tup," ",hexify(leaf));			
		pieces_tup_ix_now -:= 1;		-- backwards in pieces_tup
		cum_now -:= oll;			-- backwards in the sequence of leaves
		
	end loop;

	while cum_now >= cumi loop			-- may need to make extra deletions
pass("nocon_del");		
		oll := sr_length(bnr_comp(rec,cum_now));		-- the leaf which will be replaced
		bnr_set_comp(rec,cum_now,OM);
		cum_now -:= oll;			-- backwards in the sequence of leaves
	end loop;
	
	just_past_prev := cum_now + 1;		-- just past the leaf previous to leafi
	
	while pieces_tup_ix_now >= 1 loop		-- may need to make extra insertions
pass("nocon_ins");		
		bnr_insert(rec,just_past_prev,leaf := make_record(pieces_tup(pieces_tup_ix_now)));
		incref(leaf,-1);	-- the variable 'leaf' is now dead
		
		pieces_tup_ix_now -:= 1;		-- backwards in pieces_tup
		
	end loop;
	
	if num_childr(old_rec := rec) = 1 then			-- move from tree to simple string record representation
pass("nocon_to_simp");
		rec := bnr_voc(rec,1); incref(rec,1); incref(old_rec,-1);
				-- the child inherits one reference from its parent, so its refcount does not change
	end if;
--print("tree_dump after: ",str(bnr_dump(rec)));

end bs_set_slice;

procedure bs_length(rec);		-- length of a bigstring = string record length or tree length

	return if dr_load(rec)(type_byte) = string_record then sr_length(rec) else bnr_get_cum(rec) end if;

end bs_length;
	
procedure print_raw(); print(big_ix); end print_raw;			-- raw print for debugging

procedure bs_check_leaves(rec);		-- checks that leaves have required maximum and minimum lengths
	
	if (refcount(int_of_4(rec)) <= 0) then
		print("**** MEMORY ERROR **** node with zero refcount in tree: ",hexify(rec)); return false;
	end if;

	if (contents := dr_load(rec))(type_byte) = string_record then	-- node is a leaf

		if (srl := sr_length(rec)) > sr_hi_lim then 
			print("EXCESSIVE STRING LENGTH DETECTED IN NODE ",hexify(rec)," ",hexify(contents));
			return false;
		end if;
		
		return true;		-- node is ok
	end if;		-- otherwise we proceed recursively
	
	if (bsl := bs_length(rec)) /= (ls := #stg_from_bigstg(rec)) then 
		print("***** Length Discrepancy ***** ",bsl," - ",ls," -> ",hexify(rec)," ",hexify(dr_load(rec))); stop;
	end if; 

	return forall j in [1..num_childr(rec)] | bs_check_leaves_in(bnr_voc(rec,j));

procedure bs_check_leaves_in(rec);		-- inner workhorse
	
	if (refcount(int_of_4(rec)) <= 0) then
		print("**** MEMORY ERROR **** node with zero refcount in subtree: ",hexify(rec)); return false;
	end if;

	if (contents := dr_load(rec))(type_byte) = string_record then	-- node is a leaf

		if (srl := sr_length(rec)) > sr_hi_lim then 
			print("EXCESSIVE STRING LENGTH DETECTED IN LEAF ",hexify(rec)," ",srl," ",hexify(contents));
			return false;
		end if;

		if srl < sr_low_lim then 
			print("INSUFFICIENT STRING LENGTH DETECTED IN LEAF ",hexify(rec)," ",srl," ",hexify(contents));
			return false;
		end if;

		return true;		-- leaf is ok

	end if;		-- otherwise we proceed recursively
	
	if (bsl := bs_length(rec)) /= (ls := #stg_from_bigstg(rec)) then 
		print("***** Length Discrepancy In Leaf***** ",bsl," ",ls); stop;
	end if; 

	return forall j in [1..num_childr(rec)] | bs_check_leaves_in(bnr_voc(rec,j));

end bs_check_leaves_in;

end bs_check_leaves;

end big_string_pak;

class big_string;				-- simple wrapper object for big_string_pak

	var rec;	-- disk record for big_string; this is either as simple string record or a B-tree root record 
				-- (MADE PUBLIC FOR MEMORY USAGE DEBUGGING ONLY)

	procedure create(stg);		-- create a big string from a string
	procedure print_raw(); 			-- raw print for debugging
end big_string;

class body big_string;			-- simple wrapper object for big_string_pak

use setldb,big_string_pak;
	
	procedure create(stg); rec := bs_from_stg(stg); end create;		-- create a big string from a string

	procedure selfstr; return stg_from_bigstg(rec);	end;	-- string form of  a big string

	procedure self(i..j);		-- the slice extraction operation concatenates 
								-- all the characters between the two indicated positions
		return bs_slice(rec,i,j);
	end;
	
	procedure self(i..j) := stg;		-- slice assignment operation

		bs_set_slice(rec,i,j,stg);		-- this can change rec; when the refcount of this object goes to 0, 
										-- rec should be erased
	end;
	
	procedure #self; return bs_length(rec);	end;	-- length of this string
	
	
	procedure print_raw(); print(big_ix); end print_raw;
			-- raw print for debugging
end big_string;

program test_bs;		-- test program for big_string package and its associated class
use setldb,big_string_pak,big_string;
use byteutil,disk_records_pak,db_records,B_tree_for_bigstring,string_utility_pak;
				
var orig := "123456789a123456789b123456789c" + "123456789d123456789e123456789f" + "123456789g123456789h123456789i"
			 + "123456789j123456789k123456789lABCDE";		-- string for tests
var orig2 := "ABCDEFGHIaABCDEFGHIbABCDEFGHIc" + "ABCDEFGHIdABCDEFGHIeABCDEFGHIf" + "ABCDEFGHIg1ABCDEFGHIhABCDEFGHIi"
			 + "ABCDEFGHIjABCDEFGHIkABCDEFGHIzZZZZZ";		-- string for tests

code_pts := bs_code_pts + bnr_code_pts;		-- code points to be traversed

database_file_name := "bs_test_file";		-- set the name of the file to be used

package_tests;					-- test the underlying package
big_string_class_test;			-- test the class version

report_points_passed();			-- get traversal report

procedure check_mem(caption,rec);		-- memory check utility
	incref(rec,-1); print(caption," ",str(if is_tuple(cm := check_memory()) then cm(1) else cm end if));
end check_mem;

procedure package_tests;		-- tests of the underlying package

s2 := stg_from_bigstg(rc := bs_from_stg(s1 := orig)); print("short reconstruction: ",s1 = s2 and bs_check_leaves(rc));
check_mem("memcheck 1",rc);
s2 := stg_from_bigstg(rc := bs_from_stg(s1 := orig + orig(1..10))); print("long reconstruction: ",s1 = s2 and bs_check_leaves(rc));
check_mem("memcheck 2",rc);
s2 := stg_from_bigstg(rc := bs_from_stg(s1 := 5 * orig + orig(1..10))); print("longer reconstruction: ",s1 = s2 and bs_check_leaves(rc));
check_mem("memcheck 3",rc);
s2 := stg_from_bigstg(rc := bs_from_stg(s1 := 5 * orig)); print("longer reconstruction 2: ",s1 = s2 and bs_check_leaves(rc));
check_mem("memcheck 4",rc);
s2 := stg_from_bigstg(rc := bs_from_stg(s1 := 2 * orig + orig(1..10))); print("longer reconstruction 3: ",s1 = s2 and bs_check_leaves(rc));
check_mem("memcheck 5",rc);

s2 := stg_past(5,rc := bs_from_stg(s1 := orig)); print("end of short reconstruction: ",s1(5..) = s2);
check_mem("memcheck 6",rc);
s2 := stg_past(5,rc := bs_from_stg(s1 := orig + orig(1..10))); print("end of long reconstruction: ",s1(5..) = s2);
check_mem("memcheck 7",rc);
s2 := stg_past(5,rc := bs_from_stg(s1 := 5 * orig + orig(1..10))); print("end of longer reconstruction: ",s1(5..) = s2);
check_mem("memcheck 8",rc);
s2 := stg_past(5,rc := bs_from_stg(s1 := 5 * orig)); print("end of longer reconstruction 2: ",s1(5..) = s2);
check_mem("memcheck 9",rc);
s2 := stg_past(5,rc := bs_from_stg(s1 := 2 * orig + orig(1..10))); print("end of longer reconstruction 3: ",s1(5..) = s2);
check_mem("memcheck 10",rc);

s2 := stg_till(95,rc := bs_from_stg(s1 := orig)); print("start of short reconstruction: ",s1(1..95) = s2);
check_mem("memcheck 11",rc);
s2 := stg_till(130,rc := bs_from_stg(s1 := orig + orig(1..10))); print("start of long reconstruction: ",s1(1..130) = s2);
check_mem("memcheck 12",rc);
s2 := stg_till(630,rc := bs_from_stg(s1 := 5 * orig + orig(1..10))); print("start of longer reconstruction: ",s1(1..630) = s2);
check_mem("memcheck 13",rc);
s2 := stg_till(620,rc := bs_from_stg(s1 := 5 * orig)); print("start of longer reconstruction 2: ",s1(1..620) = s2);
check_mem("memcheck 14",rc);
s2 := stg_till(255,rc := bs_from_stg(s1 := 2 * orig + orig(1..10))); print("start of longer reconstruction 3: ",s1(1..255) = s2);
check_mem("memcheck 15",rc);

s2 := bs_slice(rc := bs_from_stg(s1 := orig),5,95); print("short slice: ",s1(5..95) = s2);
check_mem("memcheck 16",rc);
s2 := bs_slice(rc := bs_from_stg(s1 := orig + orig(1..10)),5,130); print("long slice: ",s1(5..130) = s2);
check_mem("memcheck 17",rc);
s2 := bs_slice(rc := bs_from_stg(s1 := 5 * orig + orig(1..10)),5,630); print("longer slice: ",s1(5..630) = s2);
check_mem("memcheck 18",rc);
s2 := bs_slice(rc := bs_from_stg(s1 := 5 * orig),5,620); print("longer slice 2: ",s1(5..620) = s2);
check_mem("memcheck 19",rc);
s2 := bs_slice(rc := bs_from_stg(s1 := 2 * orig + orig(1..10)),5,255); print("longer slice 3: ",s1(5..255) = s2);
check_mem("memcheck 20",rc);

rc := bs_from_stg(s1 := 4 * orig + orig(1..10)); 
bs_set_slice(rc,3,500,"");	s1(3..500) := "";	-- test slice insertion operation, long form
s2 := stg_from_bigstg(rc); print("long slice shortening insert: ",s1 = s2 and bs_check_leaves(rc));
check_mem("memcheck 21",rc);

rc := bs_from_stg(s1 := orig + orig(1..10)); 
bs_set_slice(rc,3,2,"XX");	s1(3..2) := "XX";	-- test slice insertion operation, long form
s2 := stg_from_bigstg(rc); print("long slice insert: ",s1 = s2 and bs_check_leaves(rc));
check_mem("memcheck 22",rc);

rc := bs_from_stg(s1 := orig + orig(1..10)); 
bs_set_slice(rc,3,2,3 * orig2);s1(3..2) := 3 * orig2;		-- test slice insertion operation, long form
s2 := stg_from_bigstg(rc); print("long slice long insert: ",s1 = s2 and bs_check_leaves(rc));
check_mem("memcheck 23",rc);

rc := bs_from_stg(s1 := orig);
bs_set_slice(rc,1,3,"XX");	s1(1..3) := "XX";	-- test slice assignment operation, short form
s2 := stg_from_bigstg(rc); print("short slice shrink-a-bit: ",s1 = s2 and bs_check_leaves(rc));
check_mem("memcheck 24",rc);

rc := bs_from_stg(s1 := orig);
bs_set_slice(rc,1,3,""); s1(1..3) := "";		-- test slice assignment operation, short form
s2 := stg_from_bigstg(rc); print("short slice shrinkage: ",s1 = s2 and bs_check_leaves(rc));
check_mem("memcheck 25",rc);

rc := bs_from_stg(s1 := orig);
bs_set_slice(rc,1,3,"YYYYY");	s1(1..3) := "YYYYY";	-- test slice assignment operation, short form
s2 := stg_from_bigstg(rc); print("short slice expand: ",s1 = s2 and bs_check_leaves(rc));
check_mem("memcheck 26",rc);

rc := bs_from_stg(s1 := orig);
bs_set_slice(rc,1,0,"ZZZZZ");	s1(1..0) := "ZZZZZ";	-- test slice assignment operation, short form
s2 := stg_from_bigstg(rc); print("short slice insert: ",s1 = s2 and bs_check_leaves(rc));
check_mem("memcheck 27",rc);

rc := bs_from_stg(s1 := orig + orig(1..10)); rl := bs_length(rc);
bs_set_slice(rc,rl + 1,rl,"XX"); s1(rl + 1..rl) := "XX";		-- test slice append operation, short form
s2 := stg_from_bigstg(rc); print("long slice append: ",s1 = s2 and bs_check_leaves(rc));
check_mem("memcheck 28",rc);

rc := bs_from_stg(s1 := 2 * orig + orig(1..10)); rl := bs_length(rc);
bs_set_slice(rc,rl + 1,rl,"WWWWWWW"); s1(rl + 1..rl) := "WWWWWWW";		-- test slice append operation, long form
s2 := stg_from_bigstg(rc); print("long slice append 2: ",s1 = s2 and bs_check_leaves(rc));
check_mem("memcheck 29",rc);

rc := bs_from_stg(s1 := 5 * orig + orig(1..10)); rl := bs_length(rc);
bs_set_slice(rc,rl + 1,rl,"YYYYY"); s1(rl + 1..rl) := "YYYYY";		-- test slice append operation, longer form
s2 := stg_from_bigstg(rc); print("long slice append 3: ",s1 = s2 and bs_check_leaves(rc));
check_mem("memcheck 30",rc);

rc := bs_from_stg(s1 := 5 * orig + orig(1..10)); rl := bs_length(rc);
bs_set_slice(rc,rl + 1,rl,3 * orig); s1(rl + 1..rl) := 3 * orig;		-- test slice append operation, longer form
s2 := stg_from_bigstg(rc); print("long slice long append: ",s1 = s2 and bs_check_leaves(rc));
check_mem("memcheck 30A",rc);

-- additional tests of the slice assignment operations, to proble all the special cases
-- in the code, and verify the absence of memory leaks or faulty deallocations

end package_tests;

procedure big_string_class_test;		-- test program for big_string class

the_stg := big_string(s1 := orig);
print("string length and selfstr: ",#the_stg = #s1," ",str(the_stg) = s1);
print("short slice class: ",the_stg(5..95) = s1(5..95) and bs_check_leaves(the_stg.rec));
check_mem("memcheck 31",the_stg.rec);

the_stg := big_string(s1 := orig + orig(1..10));
print("long slice class: ",the_stg(5..130) = s1(5..130) and bs_check_leaves(the_stg.rec));
check_mem("memcheck 32",the_stg.rec);

the_stg := big_string(s1 := 5 * orig + orig(1..10));
print("longer slice class: ",the_stg(5..630) = s1(5..630) and bs_check_leaves(the_stg.rec));
check_mem("memcheck 33",the_stg.rec);

the_stg := big_string(s1 := 5 * orig);
print("longer slice class 2: ",the_stg(5..620) = s1(5..620) and bs_check_leaves(the_stg.rec));
check_mem("memcheck 34",the_stg.rec);

the_stg := big_string(s1 := 2 * orig + orig(1..10));
print("longer slice class 3: ",the_stg(5..255) = s1(5..255) and bs_check_leaves(the_stg.rec));
check_mem("memcheck 35",the_stg.rec);

the_stg := big_string("");
the_stg(1..0) := 5 * "abcde"; print("insert from null: ",str(the_stg) = 5 * "abcde"); 
the_stg(26..25) := 5 * orig; print("tail insert from null: ",
				str(the_stg) = (5 * "abcde" + 5 * orig) and bs_check_leaves(the_stg.rec)); --
check_mem("memcheck 36",the_stg.rec);

the_stg := big_string(ori := ori_copy := 5 * "abcde" + 5 * orig);
--print("the_stg after creation: ",str(the_stg)," --- the_stg: ",the_stg); -- ?????????? SETL PRINT BUG ??????????
--print("the_stg after creation: ",str(the_stg)); -- THIS WORKS OK; PREVIOUS LINE DOES NOT
print("string length, selfstr, length check,selfstr check: ",#the_stg = #ori," ",str(the_stg) = ori);
stg_copy := the_stg; incref(the_stg.rec,1);		-- note that an extra copy has been created

print("slice extraction and original: ",the_stg(1..10) = ori(1..10) and bs_check_leaves(the_stg.rec)); 

print("second slice extraction and original: ",the_stg(10..20) = ori(10..20));
print("slice extraction check: ",the_stg(1..10) = ori(1..10)); 
print("second slice extraction check: ",the_stg(10..20) = ori(10..20));
print("tail slice check: ",the_stg(10..#ori) = ori(10..#ori));

the_stg(1..0) := (extra := "XXXX.XXXX.XXXX.XXXX.XXXX.XXXX.XXXX.XXXX.XXXX.XXXX."); ori2 := ori;
ori(1..0) :=  extra;
print("concatenation of 50 characters at start: ",#the_stg," ",str(the_stg) = ori and bs_check_leaves(the_stg.rec)); 

print("the_stg: ",str(the_stg)); print(); print(str(stg_copy));
incref(the_stg.rec,-1);		-- delete the string

	-- we must try long and short pure insertions, long and short impure insertions 
	-- into a single section, and long and short insertions which erase longer 
	-- runs
the_stg := stg_copy; incref(stg_copy.rec,1); ori := ori_copy; ori(1..0) := "XXX";
the_stg(1..0) := "XXX"; print("insertion of 3 characters at start: ",#the_stg," ",str(the_stg) = ori);
		-- pure short insertion case
incref(the_stg.rec,-1);		-- delete the string

the_stg := stg_copy;  incref(stg_copy.rec,1); ori := ori_copy; ori(21..20) := "XXX";
the_stg(21..20) := "XXX"; print("insertion of 3 characters after position 20: ",#the_stg," ",str(the_stg) = ori);		-- pure short insertion case
incref(the_stg.rec,-1);		-- delete the string

the_stg := stg_copy;  incref(stg_copy.rec,1); ori := ori_copy; ori(10..12) := "XXXX.XXXX.XXXX.XXXX.XXXX.XXXX.XXXX.XXXX.XXXX.XXXX.";
the_stg(10..12) := "XXXX.XXXX.XXXX.XXXX.XXXX.XXXX.XXXX.XXXX.XXXX.XXXX."; 
print("50 Xs replace chars 10 thru 12: ",#the_stg," ",str(the_stg) = ori);
incref(the_stg.rec,-1);		-- delete the string

the_stg := stg_copy;  incref(stg_copy.rec,1); ori := ori_copy; 
the_stg(10..400) := "XXXX.XXXX.XXXX.XXXX.XXXX.XXXX.XXXX.XXXX.XXXX.XXXX.";  ori(10..400) := "XXXX.XXXX.XXXX.XXXX.XXXX.XXXX.XXXX.XXXX.XXXX.XXXX.";
print("50 Xs replace chars 10 thru 400: ",#the_stg," ",str(the_stg) = ori);		--the_stg.print_raw();
incref(the_stg.rec,-1);		-- delete the string

the_stg := stg_copy;  incref(stg_copy.rec,1); ori := ori_copy; ori(10..12) := "XXX";
the_stg(10..12) := "XXX"; print("3 Xs replace chars 10 thru 12: ",#the_stg," ",str(the_stg) = ori);
incref(the_stg.rec,-1);		-- delete the string

the_stg := stg_copy;  incref(stg_copy.rec,1); ori := ori_copy; ori(150..625) := "XXX";
the_stg(150..625) := "XXX"; print("3 Xs replace chars 150 thru 625: ",#the_stg," ",str(the_stg) = ori);
incref(the_stg.rec,-1);		-- delete the string

the_stg := stg_copy;  incref(stg_copy.rec,1); ori := ori_copy; ori(10..400) := "XXX";
the_stg(10..400) := "XXX"; print("3 Xs replace chars 10 thru 40: ",#the_stg," ",str(the_stg) = ori);
incref(the_stg.rec,-1);		-- delete the string

the_stg := stg_copy;  incref(stg_copy.rec,1); ori := ori_copy; ori(10..12) := "";
the_stg(10..12) := ""; print("chars 10 thru 12 deleted: ",#the_stg," ",str(the_stg) = ori);		-- pure deletion case
incref(the_stg.rec,-1);		-- delete the string

the_stg := stg_copy;  incref(stg_copy.rec,1); ori := ori_copy; ori(10..400) := "";
the_stg(10..400) := ""; print("chars 10 thru 40 deleted: ",#the_stg," ",str(the_stg) = ori);		-- pure long deletion case
incref(the_stg.rec,-1);			-- delete the string

print("first character deletion test"); ori := ori_copy; 
the_stg1 := stg_copy; incref(stg_copy.rec,1); the_stg1(1..1) := ""; ori(1..1) := ""; print(str(the_stg1) = ori);
the_stg2 := the_stg1; incref(the_stg1.rec,1); the_stg2(1..1) := ""; ori(1..1) := ""; print(str(the_stg2) = ori);
the_stg3 := the_stg2; incref(the_stg2.rec,1); the_stg3(1..1) := ""; ori(1..1) := ""; print(str(the_stg3) = ori);
incref(the_stg3.rec,-1); incref(the_stg2.rec,-1); incref(the_stg1.rec,-1); 
 		-- delete the string copy

check_mem("memcheck 37",stg_copy.rec);		-- final erasure

end big_string_class_test;

end test_bs;
	
--	************** THE MAIN DATABASE CLASS AND ITS AUXILIARY CLASSES **************

	-- Internally, the database 'records' are long string sections, defined 
	-- by their start, end characters [start, the_end]. These are accessed through an index, 
	-- which is a cumulating vector of pairs [record_id, rec_len] with both record_id and cum_len
	-- in increasing order. record_id cumulates using the maximum function, and rec_len by integer addition.
	
	-- Each database has a to_string method which can be applied to each of its 
	-- records, which in turn generates word_list(record); This word list is 
	-- used to insert all the word occurences in the record into a 'word index' 
	-- associated with the database.
	
	-- To find a record in the database given its key, we write  
	-- 		[[key,rec_len],key_ix,cum] := big_ix{key..key_cum}; 
	--		start := cum - rec_len + 1;
		
	-- Insertions of new records into the database are simply 
	--		start := #db_record_string; db_record_string(start + 1..start) := binstr(record); 
	-- where 'record' is the new record, followed by 
	-- 		id_ctr +:= 1; db_index{OM} := [hex_rep(id_ctr),#db_record_string];
	-- and then by insertion of the word occurences in the record
	-- into the word index.
	
	-- To over_write a record within the database string db, we write
	-- db_record_string(start..the_end) := x := binstr(record), followed by
	-- db_index(key_ix) := [key,#x]; and by appropriate revision of the word index.
	
	-- Database deletions are then db_record_string(start..the_end) := "", followed by 
	-- big_ix(key_ix) := OM; and by deletion of the word occurences in the record
	-- from the word index.

package db_iterator_pak;		-- contains_iterator package for the SETL database class

	const 
		simple := 1, union := 2, difference := 3, intersection := 4;
	
	sel iterator_kind(1);			-- can be simple, union, difference, unstarted intersection,
										-- started intersection 
					-- components of simple iterators
	sel occs_tree(2);				-- big tuple containing the record keys referencing the words in the_word_ix
	sel current_occ_ptr(3);			-- pointer to index of current occurence
	sel occs_start(4);				-- first item of occurence list
	sel occs_end(5);				-- last item of occurence list
	
					-- components of compound iterators
	sel L1_child(2),L2_child(3); 	-- sub-iterators if compound iteration
	sel last_L1(4),last_L2(5); 	-- items previously obtained for this cycle of 
										-- compound iteration

	var written_refcount,written_free_list,written_in_use;		-- DEBUGGING QUANTITIES, PALCED HER FOR CONVENIENCE
	
	procedure itr_create(occs_tup,beg,nd);			-- create a simple contains_iterator for a SETL database
	procedure itr_union(iter1,iter2);				-- iterator union
	procedure itr_intersection(iter1,iter2);		-- iterator intersection
	procedure itr_diff(iter1,iter2);				-- iterator difference
	procedure itr_start(tup);						-- begin iterating over records containing word
	procedure itr_next(tup);						-- continue iterating over records containing word
	procedure itr_number(tup);						-- count operator
	procedure itr_arb(tup);							-- arb operator; returns first element
	procedure itr_destroy(tup);						-- destructor routine; eliminates reference to occs_tup

end db_iterator_pak;

package body db_iterator_pak;	-- contains_iterator package for the SETL database class

use setldb,B_tree_for_wdocstring,big_stg_for_wdoc_pak,byteutil,disk_records_pak,db_records;
						-- the underlying record and paging libraries 

procedure itr_create(occs_tup,beg,nd);		-- create a simple contains_iterator for a SETL database
--print("itr_create: ",beg," ",nd);
	tup := [1..3];				-- create the representing tuple. The last 2 components, occs_start and occs_end, are OM
	tup.iterator_kind := simple;
	tup.current_occ_ptr := newat();		-- initial reference is to OM
	tup.occs_tree := occs_tup;			-- retain occurence list identity, and create an extra reference to it
	incref(occs_tup,1);		--  add one reference to occs_tup
	if beg = OM then return tup; end if;		-- return a null iterator
	
			-- search for the desired word. If it is not found, set current_oc = OM;
			-- this will terminate iteration as soon as it begins. Otherwise set ^current_occ_ptr 
			-- to point to the first identifier in the identifiers list following the 
			-- word (or to OM if this list is empty.) Iteration will terminate as soon as
			-- the next word item in the word index is encountered.
			
			-- The word index is maintained as a vector of items word, num_occs, ...
			-- etc. with the number of words as a cumulant
	tup.occs_end := nd;
	^tup.current_occ_ptr := (tup.occs_start := beg) - 1;	-- first item of occurence list
	
--print("created iterator: ",tup);
	return tup;

end itr_create;

procedure itr_destroy(tup);						-- destructor routine; eliminates a reference to occs_tup
	incref(tup.occs_tree,-1);		-- eliminate one reference to occs_tup
	tup.current_occ_ptr := OM;		-- prevent subsequent iteration
end itr_destroy;

procedure itr_union(iter1,iter2);		-- iterator union
										-- last_L1(4),last_L2(5) are initialized to pointer atoms
	new_iter := [OM,OM,OM,newat(),newat()]; 	-- create an uninitialized new iterator
	new_iter.iterator_kind := union;
	new_iter.L1_child := iter1;		-- set the two children
	new_iter.L2_child := iter2;
	
	return new_iter;

end itr_union;

procedure itr_intersection(iter1,iter2);		-- iterator intersection
										-- last_L1(4),last_L2(5) are initialized to pointer atoms
	
	new_iter := [OM,OM,OM,newat(),newat()];			-- create an uninitialized new iterator
	new_iter.iterator_kind := intersection;
	new_iter.L1_child := iter1;		-- set the two children
	new_iter.L2_child := iter2;
	
	return new_iter;

end itr_intersection;

procedure itr_diff(iter1,iter2);		-- iterator difference
										-- last_L1(4),last_L2(5) are initialized to pointer atoms
	
	new_iter := [OM,OM,OM,newat(),newat()];	 	-- create an uninitialized new iterator
	new_iter.iterator_kind := difference;

	new_iter.L1_child := iter1;		-- set the two children
	new_iter.L2_child := iter2;
	
	return new_iter;

end itr_diff;

procedure itr_start(tup);			-- begin iterating over records containing word

	case tup.iterator_kind

	when simple =>		-- restart the iteration if possible

		if tup.occs_start /= OM then ^(tup.current_occ_ptr) := tup.occs_start - 1; end if;

	when union,difference,intersection =>	-- for the compound cases, just set up both values

		itr_start(tup.L1_child); itr_start(tup.L2_child);			-- start both children

		^(tup.last_L1) := itr_next(tup.L1_child);					-- set up both initial values
		^(tup.last_L2) := itr_next(tup.L2_child);
	end case;

end itr_start;

procedure itr_next(tup);			-- continue iterating over records containing word
	
	case tup.iterator_kind

	when simple =>
						-- if uninitialized or terminated, return OM
		if (tcop := tup.current_occ_ptr)  = OM or ^tcop = OM then return OM; end if;

		if (current_ix := (^tcop +:= 1)) > tup.occs_end  then 
			^tcop := OM; return OM;		 -- iteration ends at occs_end
		end if;
			
		return int_of_4(bswo_comp(tup.occs_tree,current_ix));
						-- otherwise return the current record identifier, as an integer

	when union =>	-- for the union, we simply return the smaller of the two 
					-- items, and advance it

		if (las_1 := ^(tup.last_L1)) = OM then
		
			if ^(tup.last_L2) = OM then return OM; end if;		-- iteration is finished
			to_return := ^(tup.last_L2);
			^(tup.last_L2) := itr_next(tup.L2_child);	-- advance the L2 iteration
			return to_return;		-- return the L2 iterator
			
		elseif (las_2 := ^(tup.last_L2)) = OM then

			to_return := ^(tup.last_L1);
			^(tup.last_L1) := itr_next(tup.L1_child);	-- advance the L1 iteration
			return to_return;		-- return the L1 iterator

		elseif las_1 <= las_2 then

			to_return := las_1;		-- return the smaller

			if las_1 = las_2 then ^(tup.last_L2) := itr_next(tup.L2_child); end if;
				-- advance L2 if it equals the L1 being returned
				
			^(tup.last_L1) := itr_next(tup.L1_child);	-- advance the smaller iteration 
			return to_return;		-- return the L1 iterator

		else

			to_return := las_2;		-- return the smaller
			^(tup.last_L2) := itr_next(tup.L2_child);	-- advance the smaller iteration
			return to_return;		-- return the L2 iterator

		end if;
		
	when difference =>	-- for the difference L1 - L2, we return and advance the L1 item if 
						-- it is the smaller of the two; otherwise we advance the L2 item
						-- until it becomes equal to or greater than the L1 item. At each
						-- equality, the L1 item is also increased.
--print("difference iter; last_L1,last_L2: ",^(tup.last_L1)," ",^(tup.last_L2));

		if (las_1 := ^(tup.last_L1)) = OM then return OM; end if;		-- iteration is finished
		
		if (las_2 := ^(tup.last_L2)) = OM or las_1 < las_2 then

			to_return := las_1;		-- return the L1 iterator
			^(tup.last_L1) := itr_next(tup.L1_child);	-- advance the L1 iteration 
			return to_return;		-- return the L1 iterator

		elseif las_1 = las_2 then	-- bypass common element

			^(tup.last_L1) := itr_next(tup.L1_child);	-- advance the L1 iteration 
			
		end if;		-- now last_L1 is larger
		
		while (las_1 := ^(tup.last_L1)) /= OM loop	-- we move forward (in both lists if necessary)
									-- till last_L2 exceeds last_L1

			while (las_2 := ^(tup.last_L2)) /= OM and las_1 > las_2 loop
				^(tup.last_L2) := itr_next(tup.L2_child);
			end loop;

			if las_1 = las_2 then 	-- must advance last_L1 again
				^(tup.last_L1) := itr_next(tup.L1_child);
				continue; 
			end if;	
			
			to_return := las_1;		-- return the L1 iterator
			^(tup.last_L1) := itr_next(tup.L1_child);	-- advance the L1 iteration 
			return to_return;		-- return the L1 iterator

		end loop;

		return OM;		-- iteration is finished

	when intersection =>	-- for the intersection, we advance the smaller of the two 
							-- items until a common value is found, when we advance both

		while (las_1 := ^(tup.last_L1)) /= OM and (las_2 := ^(tup.last_L2)) /= OM loop
			
			if las_1 = las_2 then -- we have an intersection element
			
				to_return := las_1;		-- return the L1 iterator
				^(tup.last_L1) := itr_next(tup.L1_child);	-- advance the L1 iteration 
				return to_return;		-- return the L1 iterator
			
			end if;
			
			if las_1 < las_2 then -- advance the smaller element
				^(tup.last_L1) := itr_next(tup.L1_child);	-- advance the L1 iteration 
			else
				^(tup.last_L2) := itr_next(tup.L2_child);	-- advance the L2 iteration 
			end if;
			
		end loop;

		return OM;		-- iteration is finished

	end case;

end itr_next;

procedure itr_number(tup);		-- count operator

	case tup.iterator_kind

	when simple =>

		if tup.occs_start = OM then return 0; end if;		-- word not found
		
		return tup.occs_end - tup.occs_start + 1;
	
	when union,difference,intersection =>
		-- count occurences matching criterion
		
		num := 0; itr_start(tup);
		while itr_next(tup) /= OM loop num +:= 1; end loop;
		return num;

	end case;

end itr_number;

procedure itr_arb(tup);		-- arb operator; returns first element

--print("itr_arb: ",tup.iterator_kind);	
	case tup.iterator_kind

	when simple =>		-- restart the iteration if necessary

		if (tos := tup.occs_start) = OM then return OM; end if;	
		iof := int_of_4(bswo_comp(tup.occs_tree,tos));
		
		return iof;
				-- otherwise return the first record identifier from the sequence for this word

	when union,difference,intersection =>	-- for the compound cases, just set up both values

		itr_start(tup);			-- begin iterating over records containing word
		item := itr_next(tup);		-- take the first item
		
		return if item = OM then OM else item end if;	-- return the first item
		
	end case;

end itr_arb;

end db_iterator_pak;

class contains_iterator;		-- contains_iterator for the SETL database class
	-- these iterators represent the set of all database records which 
	-- contain a given word. They support iteration and the count operation; 
	-- along with intersection, union, and difference.

procedure create(the_tup);		-- create contains_iterator for a SETL database

end contains_iterator;

class body contains_iterator;		-- contains_iterator for the SETL database class

use setldb,byteutil,disk_records_pak,db_records,db_iterator_pak;		-- for preliminary testing

var tuple;				-- representing quintuple for the iteration

procedure create(the_tup);		-- create contains_iterator for a SETL database
	-- tuple is [iterator_kind = kind,occs_tree,current_occ_ptr = OM,start,end] if simple;
	-- (but [kind,L1_child,L2_child,last_L1,last_L2] if compound)
	
	tuple := if the_tup.iterator_kind = simple then itr_create(the_tup(2),the_tup(4),the_tup(5)) else the_tup end if;
						-- use the package in the simple case; otherwise just save

end create;

procedure #self; return itr_number(tuple); end;				-- count operator; use the package
procedure self + x; return contains_iterator(itr_union(tuple,x.tuple)); end;			-- iterator union
procedure self * x; return contains_iterator(itr_intersection(tuple,x.tuple)); end;		-- iterator intersection
procedure self - x; return contains_iterator(itr_diff(tuple,x.tuple)); end;				-- iterator difference
procedure arb self; return if (res := itr_arb(tuple)) = OM then OM else stg_of_4(res) end if; end;
				-- select an arbitrary record id which references this word or words
				-- Note that itr_arb returns OM  or an integer

procedure iterator_start; itr_start(tuple); end;		-- begin iterating over records containing word
procedure iterator_next; return if (res := itr_next(tuple)) = OM then OM else [stg_of_4(res)] end if; end;
							-- continue iterating over records containing word

end contains_iterator;

-- The operations supported by the SETL database class seen below are as follows:

-- db(key);				-- find a database record given its key
-- db(key) := x;		-- store a database record, associating it with the specified key
-- db with x			-- create a new record and put x into it. db is returned.
-- db.to_string;		-- database-associated mapping of records to strings; see comment below

-- db.contains(wd);		-- creates and returns an iterator object for the given word
	-- iterations 'x in iter_obj' over these objects return the sequence of 
	-- keys of all records containing the specified words. These iteration 
	-- objects support the operations: #iter_obj (number of keys in the iteration group)
	-- arb iter_obj (first item int the iteration group, or OM if none)
	-- iter_obj + iter_obj (union iteration), iter_obj * iter_obj (intersection),
	-- iter_obj - iter_obj (iterate over difference)

class database;		-- SETL database class

var what_tree;				-- tree name for debugging
var to_string := OM;		-- database-associated mapping of records to strings
							-- this public variable should be set just once, immediately after the 
							-- database is created.
var words_just_changed;		-- words affected in last database operation 

var record_string;		-- the big string of all the records
var record_index;		-- maps record identifiers into record ends
	-- this is a big tuple consisting of record identifiers and record lengths, 
	-- with the identifiers in order. The identifiers cumulate as strings, the lengths as integers.
var identifier_generator := 0;		-- cumulating generator for record identifiers
var words_index;
	-- this is a big tuple consisting of pairs [words,len_occ_rec], where len_occ_rec is 
	-- the length (in 'occ_rec_tup', see below) of the list of records which contain the word.
	-- the words are in order
var occ_rec_tup;		-- big tuple, containing lists of word occurences (one for each
						-- record in which a given word occurs. Each such list of words contains
						-- record identifiers in increasing order

procedure create(my_to_string,file_name);		-- create or reopen a SETL database
procedure write_db();							-- write database to its file
procedure contains(wd);
		-- returns the 'contains' iterator object for a given word

procedure dump();					-- dump of database (DEBUGGING ONLY)
procedure check_consistency(caption);		-- miscelaneous consistency checks (DEBUGGING ONLY)
procedure dump_recstring();			-- dump of database record_string (DEBUGGING ONLY)
procedure write_refcount_data();	-- write out the refcount data (PUBLIC FOR DEBUGGING ONLY)
procedure dump2(x);

end database;

class body database;		-- SETL database class

use setldb,string_utility_pak,sort_pak;	-- utilities, e.g. for breaking strings at whitespace, etc.
use contains_iterator;				-- the iterator objects
use byteutil,disk_records_pak,db_records;			-- the underlying record and paging libraries 
use B_tree_for_bigstring,B_tree_for_wdocstring,B_tree_for_dbix,B_tree_for_wdix,big_string_pak,big_stg_for_wdoc_pak;
use db_iterator_pak;
									-- the B_trees representing the principal database components and their indices

var num_refcount_items_in_rec := rec_size/4 - 1;		-- number of refcount items in a record	

procedure create(my_to_string,file_name);		-- create or reopen a SETL database
	-- if file_name starts with a minus sign, we strip of the minus sign and start a new database in the indicated file
	-- otherwise we read the file, which should contain a previously written database, and use its contents to 
	-- initialize the database
	
	may_minus := match(file_name,"-"); 
	database_file_name  := file_name;		-- the global database_file_name is set from file_name 

	if may_minus = "" then		-- read the file and use its contents to initialize the database
		read_db(file_name); 	
		to_string := my_to_string?lambda(rec); return ""; end lambda;
		return;	
	end if;

	-- otherwise the database (and its representing trees) are initially empty.
	
	close(open(file_name,"TEXT-OUT"));   -- delete the file if it already exists; othewise create it

	to_string := my_to_string?lambda(rec); return ""; end lambda;
	-- by default (since a default is required), all records map to the null string
	
	record_string := dr_new_rec();	dr_load(record_string); 		-- we start with an empty string of records
	srgg:=dr_load(record_string);
	srgg(type_byte) := string_record; 
	dr_setrecbuf(record_string,srgg);
	dr_dirtify(record_string);
	
	record_index := dbix_create();		-- we start with an empty index of records; accumulates rec_id and length
						
	occ_rec_tup := dr_new_rec();	
	srgg:=dr_load(occ_rec_tup);
	srgg(type_byte) := wdoccs_string_record; 
	dr_setrecbuf(occ_rec_tup,srgg);
	dr_dirtify(occ_rec_tup);
										-- initially no word has any occurence.
										-- the occurences tuple is a list of record identifiers, divided into sections 
										-- by the cumulative counts held in 'words_index'. Within each such section
										-- the record identifiers are in order; they cumulate as strings and by count

	words_index := wix_create();			-- we start with an empty index of word occurences
		-- the words index has 2 cumulants, namely the words themselves, 
		-- accumulated using 'max', and the total length of the occurence list for each word

end create;

procedure identity_map(x); return x; end identity_map;		-- identity mapping

procedure self(key);		-- find a database record given its key
				-- the database record index call dbix_comp_cum2(rec,x) returns values of the form [rid,len,cum_len]

	--print("Self [record_index,key]=",[record_index,int_of_4(key)]);
	
	if (found := dbix_comp_cum2(record_index,key)) = OM then return OM; end if;  -- item might not be found
	
	[key_in,rec_len,stg_end] := found;

	--print("Self(key)... [key_in,rec_len,stg_end]=",
        --       [int_of_4(key_in),rec_len,stg_end]);
	
	if key_in /= key then return OM; end if;		-- no such key found
	--print(stg_end-rec_len+1," ",stg_end," Rec ",hexify(record_string));
	return unbinstr(bs_slice(record_string,stg_end - rec_len + 1,stg_end));		-- otherwise decode the record
	--return bs_slice(record_string,stg_end - rec_len + 1,stg_end);		-- otherwise decode the record

end;

procedure self(key) := rec;		-- update a database record using its key
	-- if rec = OM we delete the database record with the given key

--	print("Self:= [record_index,key]=",[record_index,int_of_4(key)]);
	[key_in,rec_len,stg_end] := dbix_comp_cum2(record_index,key);
--	print("Self(key):=rec ... [key_in,rec_len,stg_end]=",
--		[int_of_4(key_in),rec_len,stg_end]);
	
	if rec = OM then -- deletion is desired

		if key_in /= key then return; end if;
						-- key not present so no deletion needed

		being_deleted := unbinstr(bs_slice(record_string,strt := stg_end - rec_len + 1,stg_end));	
				-- the record being deleted
		deleted_wds := {wd: wd in breakup(to_string(being_deleted),"\n\t\r ") | wd /= ""};
					-- set of words being deleted
		delete_wds(deleted_wds,key);
				-- delete these word occurences from the words_index

		bs_set_slice(record_string,strt,stg_end,"");
				-- delete the record from the record_string 

		dbix_set_comp2(record_index,key,OM);
				-- delete the record data from the record_index

		words_just_changed := { };		-- we don't count deletions as changes

		return;		-- done with this case
		
	end if;		-- otherwise a record is being changed

	start_m1 := stg_end - rec_len;		-- the record will be written after this position	
	added_wds := {wd: wd in breakup(to_string(rec),"\n\t\r ") | wd /= ""};
					-- set of words being added
	words_just_changed := added_wds;
	
	if key_in /= key then 
						-- key not present, so just insert new record, using key
						-- Note: this operation should be quite rare
			-- get the string start of the record before which the insertion will 
			-- take place
		bs_set_slice(record_string,start_m1 + 1,start_m1,bstr := binstr(rec));
				-- insert the record into the record_string 
		dbix_set_comp2(record_index,key_in,[key,#bstr]);
				-- insert the record data into the record_index, immediately prior to key_in
		add_wds(added_wds,key);	-- add these word occurences to the words_index

		return;		-- done with this case

	end if;
	
		-- otherwise a record is actually being changed. Find the new words being
		-- added, and the old words being subtracted

	old_record := unbinstr(bs_slice(record_string,start_m1 + 1,stg_end));	
				-- the record as it was
	deleted_wds := {wd: wd in breakup(to_string(old_record),"\n\t\r ") | wd /= ""};
					-- set of words being deleted
	new_wds := added_wds - deleted_wds;	-- the new words to be indexed
	add_wds(new_wds,key);	-- add these word occurences to the words_index
--print("new_wds,key,words_index: ",new_wds," ",key," ",words_index);

	deleted_wds -:= added_wds;		-- set of words really being deleted
	delete_wds(deleted_wds,key);
				-- delete these word occurences from the words_index

	bs_set_slice(record_string,start_m1 + 1,stg_end,bstr := binstr(rec));
				-- change the record in the record_string 

	dbix_set_comp2(record_index,key,[key,#bstr]);
				-- change the record data in the record_index	

	words_just_changed := added_wds;

end;

procedure delete_wds(wd_list,key);	-- delete word occurences from the words_index
--if "D_4" in wd_list then 
--	print(what_tree," delete_wds - wd_list,key: ",wd_list," ",hexify(key)," ",wix_dump(words_index)," ",
--				bswo_length(occ_rec_tup)," ",hexify(occ_rec_tup)," ",hexify(stg_from_bswostg(occ_rec_tup))); 
--end if;
	for wd in wd_list loop	-- search over the words in order
						-- find index of 'wd' in the words_index
		
		[-,occs_len,up_limix] := wix_comp_cum2(words_index,wd);		-- get boundaries of word occurences section
		
		[key_in,key_ix] := wo_first_past_to(occ_rec_tup,key,up_limix - occs_len + 1,up_limix);
					-- key_in must be identical with key

		bswo_set_comp(occ_rec_tup,key_ix,OM);		-- delete the key occurence
		
			-- if the last occurence of the word has just been deleted, then delete 
			-- the word itself
		if occs_len = 1 then wix_set_comp2(words_index,wd,OM); continue; end if;
					-- otherwise reduce the number of occurences in the words_index by 1
		wix_set_comp2(words_index,wd,[wd,occs_len - 1]);

	end loop;
--if "D_4" in wd_list then 
--	print(what_tree," at end of delete_wds: ",wix_dump(words_index)," ",bswo_length(occ_rec_tup)," ",
--									hexify(occ_rec_tup)," ",hexify(stg_from_bswostg(occ_rec_tup)));
--end if;
	
end delete_wds;

procedure add_wds(wd_list,key);	-- insert word occurences into the words_index
--print(what_tree," add_wds - wd_list,key: ",wd_list," ",hexify(key)); 

	for wd in wd_list loop	-- search over the words in order
						-- find index of 'wd' in the words_index
--print("adding: ",wd," ",hexify(key));
		if (item:= wix_comp_cum2(words_index,wd)) = OM then	-- wd not bounded; insert it at end
			wix_insert(words_index,OM,[wd,1]);		-- there is just one occurence
--print("not bounded: ",hexify(occ_rec_tup)," ",hexify(stg_from_bswostg(occ_rec_tup)));
			bswo_insert(occ_rec_tup,OM,key);			-- insert the occurence at the end of occ_rec_tup 
			continue; 		-- done with this case
		end if;
		
		[wd_there,occs_len,up_limix]:= item; 
				 -- get wd in position, and boundaries of word occurences section

		if wd_there /= wd then		-- word not present; insert before word found
--print("not present: ",wd_there);
			wix_insert2(words_index,wd_there,[wd,1]);					-- insert: there is just one occurence
			bswo_insert(occ_rec_tup,up_limix - occs_len + 1,key);	-- insert the occurence before all the priors

			continue; 		-- done with this case

		end if;

			-- otherwise the word already occurs in some record; but the new 
			-- occurence must be inserted

		low_limix := up_limix - occs_len + 1;		-- we search for the key in the indicated range
		poss_occ_key_ix := wo_first_past_to(occ_rec_tup,key,low_limix,up_limix);
								
		if poss_occ_key_ix = OM then	-- there is no later occurence on record for this
										-- word, so it should be inserted at the end of
										-- the occurence list section to which it belongs
			bswo_insert(occ_rec_tup,up_limix + 1,key);	-- insert the key occurence at section end
--print("already occurs: ",wd," ",[wd,occs_len + 1]); 
			wix_set_comp2(words_index,wd,[wd,occs_len + 1]);  -- note that there is one additional occurence

			continue;		-- done with this case
		end if;
		
		[-,next_key_ix] := poss_occ_key_ix;

		bswo_insert(occ_rec_tup,next_key_ix,key);	-- insert the key occurence before the following occurence
		wix_set_comp2(words_index,wd,[wd,occs_len + 1]);  -- note that there is one additional occurence

	end loop;
--print(what_tree," at end of add_wds: ",wix_dump(words_index)," ",bswo_length(occ_rec_tup)," ",hexify(occ_rec_tup)," ",hexify(stg_from_bswostg(occ_rec_tup)));
	
end add_wds;

procedure self with rec;		-- add a new record to a database

			-- put the record into the database, and get its new key
	nrs := bs_length(record_string); bs_set_slice(record_string,nrs + 1,nrs,bsr := binstr(rec));

		-- insert the new key at the end of the record index, with its length 
		-- for cumulation

	dbix_insert(record_index,OM,[key := stg_of_4(identifier_generator +:= 1),#bsr]);

		-- now calculate the string of the record and find the set of words
	wd_list := {wd: wd in breakup(to_string(rec),"\n\t\r ") | wd /= ""};

	add_wds(wd_list,key);	-- add these word occurences to the words_index
	
	words_just_changed := wd_list;
	
	return self;	-- return the database itself as the result of this operation
end;

procedure contains(wd);
		-- returns the 'contains' iterator object for a given word
	-- tuple to be supplied is [iterator_kind = kind,occs_tree,current_occ_ptr = OM,start,end] (since simple);
	-- (but [kind,L1_child,L2_child,last_L1,last_L2] if compound)
--print("creating 'contains' iterator: ",wd);

	if (initial_item := wix_comp_cum2(words_index,wd)) = OM or initial_item(1) /= wd then 
				-- the word index call wix_comp_cum2(rec,x) returns values of the form [rid,len,cum_len]
--print("iteration will terminate");
		return contains_iterator([simple,occ_rec_tup]); -- word does not occur, so iteration will terminate
	end if;
	
	[-,occ_section_len,the_occs_end] := initial_item;
--print("iteration tuple  is: ",[simple,occ_rec_tup,OM,the_occs_end - occ_section_len + 1,the_occs_end]);
if the_occs_end > bswo_length(occ_rec_tup) then
	print("CREATING BAD CONTAINS_ITERATOR: ",the_occs_end - occ_section_len + 1," ",the_occs_end," ",
														bswo_length(occ_rec_tup));
	stop;
end if;
	return contains_iterator([simple,occ_rec_tup,OM,the_occs_end - occ_section_len + 1,the_occs_end]);

end contains;

procedure write_db();		-- write database to its file
	-- we only need to flush_all, write the refcount data to the file, and write one additional record
	-- containing (i) the location of the roots of the various database trees; (ii) the value of recno_generator;
	-- (iii) the location of the first record containing the refcount data; since the remainder of the
	-- database is already present in the file
        dr_flush_all();

	--stg := dr_load(rec_one := "\x00\x00\x00\x01");		-- initialize the first record, which contains the needed locator data
        -- Don't load the record!
        -- Instead we pass a string containing the parameters.

        stg:=" "*32;

	stg(1..4) := record_string;						-- bigstring containing records
	stg(5..8) := record_index;						-- index for bigstring containing records
	stg(9..12) := occ_rec_tup;						-- big tuple containing word occurences
	stg(13..16) := words_index;						-- index for big tuple containing word occurences
 	stg(17..21) := stg_of_5(identifier_generator);	-- identifier_generator value; this is 5 bytes

        -- Recno generator is handled internally in the native package
        -- we don't have to save it.

 	--stg(22..25) := stg_of_4(dr_recno_generator());		-- recno_generator value (maximum record used) 

	--refcount_data_start := dr_write_refcount_data();	-- write out the refcount data
	
 	--stg(26..30) := stg_of_4(refcount_data_start);				-- location of the first record containing the refcount data

	
	--dr_load(rec_one);
	--dr_setrecbuf(rec_one,stg);
	--dr_dirtify(rec_one);
 	
	--dr_flush_all();									-- write eveything out
        db_close_db(stg); -- close database
				
end write_db;

procedure read_db(file_name);		-- read database from the indicated file
	-- we only need to reconstruct refcount data, free_list, and in_use data from the file,
	-- and set (i) the location of the roots of the various database trees; (ii) the value of recno_generator;
	-- since the remainder of the database is already present in the file
	print("reading database");
        db_open_db(file_name);
	stg:=db_read_param();

	record_string := stg(1..4);						-- bigstring containing records
	record_index := stg(5..8);						-- index for bigstring containing records
	occ_rec_tup := stg(9..12);						-- big tuple containing word occurences
	words_index := stg(13..16);						-- index for big tuple containing word occurences
 	identifier_generator := int_of_5(stg(17..21));	-- identifier_generator value; this is 5 bytes

	--read_refcount_data(refcount_data_start,recno_generator);	-- reconstruct the refcount data
--print("read_db check: ",written_refcount(2..recno_generator) = refcount(2..recno_generator)," ",written_free_list = free_list," ",written_in_use = in_use);
--print(str(written_refcount)," ",str(refcount));
--print(str([hexify(x) : x in written_free_list])," ",str([hexify(x) : x in free_list]));
--print(str([hexify(x) : x in (written_in_use - in_use)])," ",str([hexify(x) : x in (in_use - written_in_use)]));

end read_db;

procedure write_refcount_data();	-- write out the refcount data
	-- we group the refcount data into a series of 4-byte fields within a chained list of blocs, 
	-- fitting these into free-list elements where possible, but otherwise using new records.
	-- each block contains a pointer to the next block in the chain.
	
	if recno_generator = 1 then return; end if; 		-- no recno data to write
	written_refcount := refcount; written_free_list := free_list; written_in_use := in_use;
	
--print("recno_generator: ",recno_generator," ",#refcount," ",str(refcount)," ",str([hexify(x): x in free_list])," ",str([hexify(x): x in in_use]));
	current_stg := dr_load(current_record := dr_new_rec());		-- allocate a first record to hold refcount data
	ref_data_start := current_record;
	place_in_current_record := 1;

	for j in [1..recno_generator] loop

		if (place_in_current_record +:= 4) > rec_size then		-- must write another record
			-- SPAM dirty with:= current_record;						-- note that current record has ween written
			next_record := current_stg(1..4) := dr_new_rec();		-- allocate another record to hold refcount data
			dr_load(current_record); 								-- prepare to write the next record
			dr_setrecbuf(current_record,current_stg);
			dr_dirtify(current_record);
--print("record to be written: ",hexify(current_record)," ",hexify(current_stg));
			current_stg := dr_load(current_record := next_record); -- start on the new string
			place_in_current_record := 5;						-- restart the write-position index
		end if;
		
		current_stg(place_in_current_record..place_in_current_record + 3) := stg_of_4(refcount(j)?0);
		
	end loop;
	
			dr_setrecbuf(current_record,current_stg);
			dr_dirtify(current_record);
--print("record to be written: ",hexify(current_record)," ",hexify(current_stg));

--print(hexify(current_record)," ",hexify(current_stg));	
	
	return ref_data_start;
	
end write_refcount_data;

procedure read_refcount_data(refcount_data_start,recno_generator);	-- reconstruct the refcount data
	-- we read the refcount data into a series of 4-byte fields withing a chained list of blocs, 
	-- fitting these into free-list elements where possible, but otherwise using new records.
	-- each block contains a pointer to the next block in the chain.
	
	free_list := []; in_use := { };			-- this routine also reconstructs the free_list tuple and in_use set
	
	place_in_current_record := 1;
	rno := int_of_4(refcount_data_start);
	gets(file_handle,(rno - 1) * rec_size + 1,rec_size,current_stg);		-- read the current record
--print("first record read: ",hexify(refcount_data_start)," ",hexify(current_stg));
	next_record := current_stg(1..4);						-- prepare to read the next record
	
	records_containing_refcounts := [refcount_data_start];		-- we need to collect these records, and free them

	--SPAM refcount := [1..recno_generator];			-- allocate a vector; its components will be over-written 
	
	for j in [1..recno_generator] loop

		if (place_in_current_record +:= 4) > rec_size then		-- must read next record
			place_in_current_record := 5; current_record := next_record;
			rno := int_of_4(current_record);
			gets(file_handle,(rno - 1) * rec_size + 1,rec_size,current_stg);		-- read the current record
--print("record read: ",hexify(current_record)," ",hexify(current_stg));
			records_containing_refcounts with:= current_record;		-- we need to collect these records, and free them
			next_record := current_stg(1..4);						-- prepare to read the next record
		end if;
		
		-- SPAM refcount(j) := rcj := int_of_4(current_stg(place_in_current_record..place_in_current_record + 3));
		
		if rcj = 0 then 				-- record belongs to free list
			free_list with:= stg_of_4(j);
		else							-- record is in use
			in_use with:= stg_of_4(j);
		end if;

	end loop;

	for rec in records_containing_refcounts loop		-- we free all these records, and put them on the free list
		spam:=0;
		-- SPAM refcount(int_of_4(rec)) := 0; free_list with:= rec; in_use less:= rec;
	end loop;
	
	free_list := [rc: rc in {rec : rec in  free_list | (ir := int_of_4(rec)) <= recno_generator and ir /= 1}];
	
end read_refcount_data;

procedure dump2(x);
		print("***** Record  ***** ",hexify(x)," ",hexify(dr_load(x))); 
end;

procedure dump();		-- dump of database
		-- we dump the database by iterating over its record index,
		-- and printing each successive record; and also over the words index,
		-- and printing the list of records containing occurences of each word
		
	print(what_tree," DATABASE RECORDS:");
--print("record_string: ",bs_length(record_string)," ",stg_from_bigstg(record_string));	
--print("occ_rec_tup: ",hexify(stg_from_bswostg(occ_rec_tup)));	
	cur_cum := 1;							-- cumulant value of current record
	ril := dbix_get_cum(record_index);		-- final cumulant value for whole records index
	
	bs_check_leaves(record_string);			-- check the record string for consistency

	while cur_cum < ril loop		-- loop over all records

		[rid,rid_len] := dbix_comp_cum(record_index,cur_cum);
		print("record ",int_of_4(rid),": [",cur_cum,"..",cur_cum + rid_len - 1,"] ",
								unbinstr(bs_slice(record_string,cur_cum,cur_cum + rid_len - 1)));
		cur_cum +:= rid_len;			-- advance to the next record

	end loop;
	
	print("INDEXED WORDS AND WORD OCCURENCES:");
	cur_cum := 1;							-- cumulant value of current word
	wil := wix_get_cum(words_index);		-- final cumulant value for whole words index

	while cur_cum <= wil loop		-- loop over all words

		[word,num_occs,word_cum] := wix_comp_cum(words_index,cur_cum);
		word_tup := [int_of_4(bswo_comp(occ_rec_tup,j)): j in [cur_cum..word_cum]];		-- tuple of word occurences
		
		print(word,": (",cur_cum,"..",word_cum," )",word_tup);
		cur_cum := word_cum + 1;
	
	end loop;
	
	print();
	
	return "";
	
end dump;

procedure check_consistency(caption);		-- miscellaneous consistency checks (DEBUGGING ONLY)
	
	if bswo_length(occ_rec_tup) < wix_get_cum(words_index) then
		print(caption," DATABASE INCONSISTENCY: OCCURENCE STRING LENGTH IS LESS THAN TOTAL WORD INDEX CUMULANT ",
										hexify(stg_from_bswostg(occ_rec_tup)),"\n",wix_dump(words_index)); stop;
	end if;
	
end check_consistency;

procedure dump_recstring();		-- dump of database record_string (DEBUGGING ONLY)
	print(bnr_dump(record_string));
end dump_recstring;

end database;

class active_database;		-- 'active' database class, with hibernators

procedure create(my_to_string,my_do_transaction,file_name);		-- create or reopen an active database
procedure write_act();							-- write database to its file
procedure contains(wd);
		-- returns the 'contains' iterator object for a given word
procedure watch(Tid,word_list,transaction_state);
						-- 'watch' routine, which causes a hybernator to sleep, watching a list of words

procedure dump();							-- dump database (DEBUGGING)

end active_database;

class body active_database;		-- 'active' database class, with hibernators
	use setldb,database;			-- use the standard database class; there will be two of these
	use byteutil,disk_records_pak,db_records,db_iterator_pak;	
	use string_utility_pak;
	
	var inner_database,hibernators;		-- the actual database, and the hibernators watching and writing it
	var in_exec_loop := false;			-- flag indicating whether or not we are in the execution loop of 'alert_hibs'
	var line_to_stg,do_transaction; 	-- the two procedural parameters passed to 'create'
	
procedure create(my_to_string,my_do_transaction,file_name);		-- create or reopen an active database
		-- note that the following lines will read and existing database pair unless file_name starts with '-'
	
	line_to_stg := my_to_string; do_transaction := my_do_transaction;		-- save the procedure parameters
	
	inner_database := database(line_to_stg,file_name + ".db");	-- create an empty database, or read
	inner_database.what_tree := "inner";		-- for debugging
	hibernators := database(hiber_to_stg,file_name + ".hib");		-- create an empty hibernators database, or read
	hibernators.what_tree := "hibernators";		-- for debugging

end create;

procedure hiber_to_stg(tup);		-- convert a hibernator's state tuple to an index string 
	-- there are several cases. For 'newborn' hibernators, the tuple is [word_watched_for]  
	-- For 'hibernating' hibernators, the tuple is [word_watched_for,state].
	-- For active hibernators, the tuple is [".",wakeup_wd,state]
	-- For the currently executing hibernator (if any),  the tuple is ["",wakeup_wd,state]
	
	return tup(1);		-- we simply return the first component 

end hiber_to_stg;

procedure write_act();							-- write an active database to its file

	inner_database.write_db(); hibernators.write_db();		-- write both parts
	
end write_act;

procedure dump();							-- dump database (DEBUGGING)
	print("INNER_DATABASE"); inner_database.dump(); print("HIBERNATORS"); hibernators.dump();
end dump;

procedure contains(wd);					-- returns the 'contains' iterator object for a given word

	if is_tuple(wd) then return hibernators.contains(wd); end if;		-- iterate over the hibernators

	return inner_database.contains(wd);	-- iterate over the inner database

end contains;

procedure self(key);		-- find a database record or hibernator record given its key

	if is_tuple(key) then return hibernators(key(1)); end if;		-- reference the hibernators

	return inner_database(key);	-- get it from the inner database

end;

procedure self(key) := rec;		-- update a database record or hybernator record using its key
--print("assigning record: ",hexify(key)," ",rec);
	if is_tuple(key) then			-- assignment to the hibernators
		hibernators(key(1)) := rec; hibernators.check_consistency("hibernators"); return self; 
	end if;
			

	inner_database(key) := rec;				-- write the inner database
	inner_database.check_consistency("inner_database");
	alert_hibs();							-- alert the hibernators, which may modify the inner database

end;

procedure self with rec;		-- add a new record or hybernator to a database
		-- for external use, we encode additons to the hibernators by wrapping them into singleton sets
--print("new record: ",rec);
	if is_set(rec) and #rec = 1 then 		-- addition to the hibernators
		hibernators with:= arb(rec); hibernators.check_consistency("hibernators"); return self; 
	end if;

	inner_database with:= rec;				-- write the inner database
	inner_database.check_consistency("inner_database");
--print("after insertion of new record: "); inner_database.dump();
	alert_hibs();							-- alert the hibernators, which may modify the inner database

	return self; 
	
end;

procedure alert_hibs();		-- alert the hibernators, which may modify the inner database
	
	words_alerted := inner_database.words_just_changed;		-- words affected by the last operation
	
	for wd in words_alerted loop			-- alert all hibernators watching this word
		for hib in hibernators.contains(wd) loop
			hibernators(hib) := [".",wd,hibernators(hib)(2)];		-- those hibernating have '.' as first component
						-- save only the alerting word and the state, not the list of words being watched
		end loop;
	end loop;
	
	if in_exec_loop then return; end if;		-- to eliminate useless recursion

	in_exec_loop := true;		-- set the execution loop flag
	
	while (first_active := arb(hibernators.contains("."))) /= OM loop
					-- hibernators potentially active have '.' as first component

		hibernators(first_active) := [""] + hibernators(first_active)(2..);
				-- the hibernator which is executing has "" as first component, so is not sensitive to any word change 
				-- i.e. now hibernators(first_active) is ["",wd,transaction_state]
--print("activating: ",hexify(first_active)," ",hibernators(first_active)," ",hibernators.dump()); 
		
--		incref(old_inner_database := inner_database,1); incref(old_hibernators := hibernators,1);
																			-- note two more references
		modified_db := do_transaction(first_active,self);
					-- and call the transaction (cautiously); this will modify a copy of the database,
					-- but not the hibernators

		inner_database := modified_db.inner_database; hibernators := modified_db.hibernators; 
			-- copy the components of the modified database
--		incref(old_inner_database,-1); incref(old_hibernators,-1);			-- drop these two references
		
	end loop;

	in_exec_loop := false;		-- drop the execution loop flag

end alert_hibs;

procedure watch(Tid,word_list,transaction_state);		-- 'watch' routine, which causes a hybernator to sleep,
														-- watching a list of words
	-- As the Tid of a transaction (i.e. hibernator), it is best to use its record id.

		-- When the transaction is active but not running, its record is [".",transaction_state],
		-- and it is simply indexed by ".". When it is hibernating, and watching the words of word_list,
		-- it is indexed by all the words in word_list. To find all the transactions hibernating on the word wd,
		-- we examine the iterators 
	
		--					rid in in hibernators.contains(wd)
		
		--  and execute
		
		--	for rid in in hibernators.contains(wd) loop db(rid) := [".",[wd],hibernators(hib)(2)] end loop;
		
		-- which makes the transaction potentially active.
		-- To establish a 'watch', we simply execute
--print("watch: ",hexify(Tid)," ",word_list," ",transaction_state);	
	hibernators(Tid) := [word_list(1),transaction_state];		-- this drops the hibernator from active state
				-- a hibernator which is watching has the (unique) word for which it is watching as its first component
				
	return self;		-- return the database as it now is
	
end watch;

end active_database;

program test;		-- test program for SETL database class

	use database;		-- use the SETL database class
	use string_utility_pak;		

	db := database(pair_to_stg,"-disk_record_file");		-- create an empty database, to store test pairs

	db with:= ["Jack",68];	--print("first pair added\n",db.dump());		-- add a first pair
	db with:= ["Diana",41];	--print("second pair added\n",db.dump());		-- add a second pair
	db with:= ["Rachel",41]; --print("third pair added\n",db.dump());	-- add a third pair
	
	print("database with 3 records",db.dump());
	
	print("testing arb: list a record containing 'Jack': ",db(jack_key := arb(db.contains("N:Jack"))));
	print("testing arb: list a record containing 'Diana': ",db(arb(db.contains("N:Diana"))));
	print("testing arb: list a record containing 'A:68': ",db(arb(db.contains("A:68"))));
	print("testing arb: list a record containing 'A:41': ",db(arb(db.contains("A:41"))));
	print("testing arb: count records containing 'A:41': ",#db.contains("A:41"));

	print("testing change of record, set ['Jack',68] to ['Jack',69]");
	db(jack_key) := ["Jack",69];	-- another year gone by
	print(db.dump());
	print("testing arb after change: list a record containing 'N:Jack': ",db(arb(db.contains("N:Jack"))));
	print("testing arb after change: list a record containing 'A:68': ",a_rec := arb(db.contains("A:68"))," ",
		if a_rec /= OM then db(a_rec) else "Non-existent ****" end if);
	print("testing arb after change: list a record containing 'A:69': ",db(arb(db.contains("A:69"))));
	print("testing arb after change: list a record containing 'A:41': ",a_rec := arb(db.contains("A:41"))," ",if a_rec /= OM then db(a_rec) 
			else "Non-existent ****" end if);
	print("testing arb after change: list a record containing 'N:Diana': ",db(arb(db.contains("N:Diana"))));

			-- now start testing the basic iterators
	print("testing the basic iterators: list record keys containing 'Jack'");
	for key in (jack_recs := db.contains("N:Jack")) loop print(hexify(key)); end loop;
	print("testing the basic iterators2: list records containing 'Jack'");
	for key in (jack_recs := db.contains("N:Jack")) loop print(db(key)); end loop;
	print("testing the basic iterators3: list records containing 'Diana'");
	for key in (jack_recs := db.contains("N:Diana")) loop print(db(key)); end loop;
	print("testing the basic iterators4: list records containing '41'");
	for key in (jack_recs := db.contains("A:41")) loop print(db(key)); end loop;

	print("\n****** Union test - records containing 'Jack' or 'Diana' ******");

	for key in (jd_recs := (db.contains("N:Jack") + db.contains("N:Diana"))) loop 
	 	print(hexify(key)," ",db(key));
	end loop;

	print("\n****** Union test - records containing 'Rachel' or 'Diana' ******");

	for key in (jd_recs := (db.contains("N:Rachel") + db.contains("N:Diana"))) loop 
	 	print(hexify(key)," ",db(key));
	end loop;

	print("testing arb for union: list a record containing 'Jack' or 'Diana': ",key := arb jd_recs," ",db(key));

	print("\n****** Intersection test - records containing '41' and 'Diana' ******");

	for key in (jd_recs := (db.contains("A:41") * db.contains("N:Diana"))) loop 
	 	print(hexify(key)," ",db(key));
	end loop;

	print("\n****** Intersection test - records containing '41' and 'Rachel' ******");

	for key in (jd_recs := (db.contains("A:41") * db.contains("N:Rachel"))) loop 
	 	print(hexify(key)," ",db(key));
	end loop;

	print("testing arb for Intersection: list a record containing '41' and 'Rachel': ",key := arb jd_recs," ",db(key));

	print("\n****** Difference test - records containing '41' andnot  'Diana' ******");

	for key in (jd_recs := (db.contains("A:41") - db.contains("N:Diana"))) loop 
	 	print(hexify(key)," ",db(key));
	end loop;

	print("\n****** Difference test - records containing '41' andnot  'Rachel' ******");

	for key in (jd_recs := (db.contains("A:41") - db.contains("N:Rachel"))) loop 
	 	print(hexify(key)," ",db(key));
	end loop;
	
	db with:= ["Jack", 6];				-- add a younger jack
	
	print("testing the basic iterators5: list records containing '6'");
	for key in (jack_recs := db.contains("A:6")) loop print(db(key)); end loop;
	
	print("****** Print all records containing 'Jack' ******");
	for key in (jack_recs := db.contains("N:Jack")) loop print(db(key)); end loop;
	print("recs with Jack and '69': ",hexify(jack_rec69 := arb(db.contains("N:Jack") * db.contains("A:69"))));
	if jack_rec69 /= OM then print(db(jack_rec69)); end if;

	print("recs with Jack and '6': ",hexify(jack_rec6 := arb(db.contains("N:Jack") * db.contains("A:6"))));
	if jack_rec6 /= OM then print(db(jack_rec6)); end if;

	print("\n****** Difference test ******");
	print("recs with Jack but not '6'");
	for x in (jnd_recs := db.contains("N:Jack") - db.contains("A:6")) loop
		print(db(x)); 
	end loop;

	print("\n****** Intersection test ******");
	print("recs with Jack and '6'");
	for x in (jnd_recs := db.contains("N:Jack") * db.contains("A:6")) loop
		print(db(x)); 
	end loop;

	print("\n****** Deleting a record -- the 'Jack' with A:69 ******");
	jack_rec69 := jack_key := arb(db.contains("N:Jack") * db.contains("A:69")); db(jack_rec69) := OM;
	print(db.dump());
	
	print("remaining recs with Jack");
	for x in (jack_recs := db.contains("N:Jack")) loop print(db(x)); end loop;

	print("\n****** Restoration Test - Restoring ['Jack',68]; plus Difference Test ******");

	db with:= ["Jack",68];		-- the other Jack again
	print(db.dump()); print("Records with 'Jack' but not 'Diana'");
	for x in (jnd_recs := db.contains("N:Jack") - db.contains("N:Diana")) loop
		print(db(x)); 
	end loop;

	print("\n****** Deleting another record -- the 'Jack' with A:6 ******");
	print("Record identifier is: ",hexify(lit_jack_rec := arb(db.contains("N:Jack") * db.contains("A:6")))); 
	print("Deleting ",db(lit_jack_rec));
	db(lit_jack_rec) := OM;
	print(db.dump()); print("Records with 'Jack'");
	for x in (jack_recs := db.contains("N:Jack")) loop print(db(x)); end loop;

	for j in [1..20] loop 
		db with:= ["Jack",j]; db with:= ["Diana",j]; db with:= ["Rachel",j];
	end loop;
	print("pairs added\n",db.dump());

	print("records with 'A:3'");
	print(str([db(x): x in (jnd_recs := db.contains("A:3"))])); 
	print("records with 'N:Diana'");
	print(str([db(x): x in (jnd_recs := db.contains("N:Diana"))])); 
	print("records with 'A:3' but not 'N:Diana'");
	print(str([db(x): x in (jnd_recs := db.contains("A:3") - db.contains("N:Diana"))])); 
	print("records with 'N:Diana' but not 'A:3'");
	print(str([db(x): x in (jnd_recs := db.contains("N:Diana") - db.contains("A:3"))])); 
	print("records with 'N:Diana' and 'A:3'");
	print(str([db(x): x in (jnd_recs := db.contains("N:Diana") * db.contains("A:3"))])); 
	print("records with 'N:Diana' or 'A:3'");
	print(str([db(x): x in (jnd_recs := db.contains("N:Diana") + db.contains("A:3"))])); 

	print("\n Memory Leak Tests"); 
print("TO BE ADDED"); 

	print("\n Database Copy Tests"); 
print("TO BE ADDED"); 
	
	print("\n ***********END OF TESTS ***********"); 
	db.write_db();
stop;
	
	procedure pair_to_stg(p);	-- convert name, age pair to string

		[nm,age] := p; return " N:" + nm + " A:" + str(age);

	end pair_to_stg;

end test;
		
	-- Distributed versions of the proposed databases can be organized as sets of records
	-- maintained independently on multiple computers, each of which stores a section 
	-- of the big_string representing the database, and an index to the words 
	-- in this section of the big_string. parallel searches over these 
	-- separate sections generate sets, mustly null, which are combined into 
	-- larger sets. The search can be optimized by storing hints of the form
	-- wd:* on the processors which distribute the queries. If no word of this 
	-- form is present on a processor P when a search for a string starting in this way is 
	-- being distributed, P need not receive a copy of the query.

	
	-- if the free_list is empty when a section is requested, a 'space exhausted' 
	-- abort results.
	
program test2;		-- test program for SETL database class

	use database;		-- use the SETL database class
	use string_utility_pak;		

	db := database(pair_to_stg,"-disk_record_file");		-- create an empty database, to store test pairs

	db with:= ["King",212,"5001317"];
	db with:= ["Queen",718,"2681702"];
	db with:= ["Slave",212,"9983470"];
	
	print("database with 3 records",db.dump());
	stop;
	
stop;
	
	procedure pair_to_stg(p);	-- convert name, age pair to string

		[nm,ac,ph] := p; 
	return " N_" + nm + " AC_" + str(ac)+" P_"+ph;

	end pair_to_stg;

end test2;
Software Preservation Group

Sections

Personal tools

Document Actions

DB_database.stl