DB_database.stl
The main big strings and database implementation, with test.
--files: string_utility_pak.stl,sort_pak.stl,get_lines_pak.stl,random_pak.stl -- disk_records_refct.stl, B_tree_for_bigs_refct.stl, B_tree_for_bigs_wdix.stl, B_tree_for_bigs_wdoc.stl, -- B_tree_for_bigs_dbix.stl, Big_stg_for_wdoc.stl --file database_top.stl test_bs package big_string_pak; const bs_code_pts := {"copy", "simple", "optimized", "chop", "tree", "concat_join", "concat_balance", "concat_chop", "nocon_opt", "nocon", "nocon_left", "nocon_right", "nocon_iter", "nocon_del", "nocon_ins", "nocon_to_simp"}; -- code points to be traversed procedure bs_from_stg(stg); -- create a big string from a string procedure stg_from_bigstg(rec); -- make a string from a big_string procedure bs_slice(rec,i,j); -- the slice extraction operation procedure bs_length(rec); -- length of a bigstring procedure bs_set_slice(rw rec,i,j,stg); -- the slice assignment operation -- (PUBLIC FOR DEBUGGING ONLY) procedure stg_past(j,rec); -- make a string from a the part of a big_string from j onward procedure stg_till(j,rec); -- make a string from a the part of a big_string up to j procedure chop_up(first,mid,last); -- chop the concatenation of three strings into a tuple of string records procedure print_raw(); -- raw print for debugging procedure bnr_voc(rec,j); -- j'th member of vector of children for B_tree_for_bigstring records procedure bs_check_leaves(rec); -- leaf consistency check end big_string_pak; package body big_string_pak; use setldb,byteutil,disk_records_pak,db_records,B_tree_for_bigstring,string_utility_pak; procedure bs_from_stg(stg); -- make a big_string from a string -- we first chop up the string into a tuple of string records if #(pieces_tup := chop_up("",stg,"")) = 1 then return make_record(pieces_tup(1)); end if; -- see comment on the 'chop_up' routine, below. If the whole string fits in -- just one string record, we simply return that record. --print("#pieces_tup: ",#pieces_tup); --print("Total len ",+/[#y:y in pieces_tup]); return bnr_make_from_tuple(pieces_tup); -- otherwise convert the list of sections returned into a tree end bs_from_stg; procedure stg_from_bigstg(rec); -- make a string from a big_string if dr_load(rec)(type_byte) = string_record then return sr_slice(rec,1,sr_length(rec)); end if; -- get string from record if not dr_is_compound(rec) then -- concatenate strings from records return "" +/ [sr_slice(ch,1,sr_length(ch)): j in [1..num_childr(rec)] | (ch := bnr_voc(rec,j)) /= OM]; end if; return "" +/ [stg_from_bigstg(bnr_voc(rec,j)): j in [1..num_childr(rec)]]; -- else proceed recursively end stg_from_bigstg; procedure stg_past(j,rec); -- make a string from a the part of a big_string from j onward if dr_load(rec)(type_byte) = string_record then -- get string from record if j > (sl := sr_length(rec)) then abort("Illegal character index in simple stg_past operation: " + str(j) + ", " + sl); end if; return sr_slice(rec,j,sl); -- return slice from j onward end if; -- get the child containing the first character past j if not (exists chix in [1..nc := num_childr(rec)] | bnr_get_ch_cum(rec,chix) >= j) then abort("Illegal character index in stg_past operation: " + str(j) + ", " + nc); end if; prev_cum := if chix = 1 then 0 else bnr_get_ch_cum(rec,chix - 1) end if; -- the previous cumulant return stg_past(j - prev_cum,bnr_voc(rec,chix)) +/ [stg_from_bigstg(bnr_voc(rec,j)): j in [chix + 1..nc]]; end stg_past; procedure stg_till(j,rec); -- make a string from a the part of a big_string up to j if dr_load(rec)(type_byte) = string_record then -- simple string record case if j > (sl := sr_length(rec)) then abort("Illegal character index in simple stg_past operation: " + str(j) + ", " + sl); end if; return sr_slice(rec,1,j); -- return slice up to j end if; -- get the child containing the first character past j if not (exists chix in [1..nc := num_childr(rec)] | bnr_get_ch_cum(rec,chix) >= j) then abort("Illegal character index in stg_past operation: " + str(j) + ", " + nc); end if; prev_cum := if chix = 1 then 0 else bnr_get_ch_cum(rec,chix - 1) end if; -- the previous cumulant return ("" +/ [stg_from_bigstg(bnr_voc(rec,j)): j in [1..chix - 1]]) + stg_till(j - prev_cum,bnr_voc(rec,chix)); end stg_till; procedure bs_slice(rec,i,j); -- the slice extraction operation concatenates all the characters -- between the two indicated positions. the 'node' can be either -- the top of a tree, or can be a simple string_record -- for a tree, we locate the sections F, L containing the first and last characters, and -- return the concatenation of the part of F past i, the part of L up to j, -- and the concatenation of the strings associated with all intermediate child trees --print("bs_slice: ",i," ",j); if i < 1 or j < i - 1 then abort("Illegal second and/or first parameters in string extraction operation" + str(j) + ", " + str(i)); end if; if dr_load(rec)(type_byte) = string_record then -- simple string record case sl := sr_length(rec); if j > sl then -- get string from record print("bs_slice: Illegal character index in simple string extraction operation: " + str(j) + ", " + str(sl)); stop; end if; --print("bs_slice bottom: ",i," ",j," ",sr_slice(rec,i,j)); return sr_slice(rec,i,j); end if; -- get the child containing the first character past i if not (exists chixi in [1..nc := num_childr(rec)] | bnr_get_ch_cum(rec,chixi) >= i) then abort("Illegal first character index in string extraction operation: " + str(i) + ", " + str(nc)); end if; -- get the child containing the first character past j if not (exists chixj in [1..nc := num_childr(rec)] | bnr_get_ch_cum(rec,chixj) >= j) then abort("Illegal second character index in string extraction operation: " + str(j) + ", " + str(nc)); end if; prev_cumi := if chixi = 1 then 0 else bnr_get_ch_cum(rec,chixi - 1) end if; -- the cumulant previous to i prev_cumj := bnr_get_ch_cum(rec,chixj - 1); -- the cumulant previous to j if chixi = chixj then -- proceed recursively return bs_slice(bnr_voc(rec,chixi),i - prev_cumi,j - prev_cumi); end if; xxx:=(stg_past(i - prev_cumi,bnr_voc(rec,chixi)) +/[stg_from_bigstg(bnr_voc(rec,k)): k in [chixi + 1..chixj - 1]]) + stg_till(j - prev_cumj,bnr_voc(rec,chixj)); -- proceed recursively: from, middle, end return xxx; return (stg_past(i - prev_cumi,bnr_voc(rec,chixi)) +/[stg_from_bigstg(bnr_voc(rec,k)): k in [chixi + 1..chixj - 1]]) + stg_till(j - prev_cumj,bnr_voc(rec,chixj)); -- proceed recursively: from, middle, end end bs_slice; procedure bnr_voc(rec,j); -- j'th member of vector of children for B_tree_for_bigstring records stg := dr_load(rec); -- load this string cjstrt := (j - 1) * 4 + bnr_ch_start; return stg(cjstrt..cjstrt + 3); -- return child rec end bnr_voc; procedure bnr_get_ch_cum(rec,j); -- get the cum value for the j-th child of this node return int_of_5(dr_load(rec)(bnr_cum_start + (j - 1) * 5..bnr_cum_start + j * 5 - 1)); end bnr_get_ch_cum; procedure chop_up(first,mid,last); -- chop the concatenation of three strings into a tuple of string records -- if all three pieces fit into one, two or three sections, return these. Otherwise join as much as possible of the -- middle into two pieces, and then the reminder of the middle into approximately equal-sized pieces. -- first and lat are assumed to be no more than one record size each if (total := (nf := #first) + (nm := #mid) + (nl := #last)) <= sr_hi_lim then -- put all into one piece return [first + mid + last]; end if; if total < 2 * sr_hi_lim then -- two pieces are enough center := total/2; -- half the total if center <= nf then return [first(1..center), first(center + 1..) + mid + last]; end if; if center <= nf + nm then return [first + mid(1..used := center - nf), mid(used + 1..) + last]; end if; return [first + mid + last(1..used := center - nf - nm), last(used + 1..)]; end if; -- otherwise first + mid cannot fit into one piece if total <= 3 * sr_hi_lim then -- if they fit into 3 pieces, put as much as possible -- of first and mid into one piece, and chop up the remainder return [first + mid(1..used := sr_hi_lim - nf)] + chop_up("",mid(used + 1..),last); end if; -- otherwise put as much as possible of mid together with first and last, and -- cut the reminder of the middle into approximately equal-sized pieces. first := first + mid(1..used := sr_hi_lim - nf); last := mid((new_end := nm - (sr_hi_lim - nl)) + 1..) + last; npieces := (rem := new_end - used)/sr_hi_lim; -- number of full-sized pieces psm1 := sr_hi_lim - 1; if (extra_part := rem mod sr_hi_lim) = 0 then -- fits into a list of full pieces return [first] + [mid(j..j + psm1): j in [used + 1,used + sr_hi_lim + 1..new_end - psm1]] + [last]; else -- there is a bit left over start_last_2 := new_end - psm1 - extra_part; -- starting character of last 2 sections mid_last_2 := start_last_2 + (psm1 + extra_part)/2; -- middle character of last 2 sections -- use list of all but 1 full piece, plus two smaller pieces. return [first] + [mid(j..j + psm1): j in [used + 1,used + sr_hi_lim + 1..new_end - 2 * psm1 - extra_part]] + [mid(start_last_2..mid_last_2),mid(mid_last_2 + 1..new_end)] + [last]; end if; end chop_up; procedure make_record(stg); -- creates one record from string rec := set_type(dr_new_rec(),string_record); sr_set_slice(rec,1,0,stg); return rec; end make_record; procedure pass_pass(stg); print(stg); pass(stg); end pass_pass; -- print and pass for debugging procedure bs_set_slice(rw rec,i,j,stg); -- the slice assignment operation --print("bs_set_slice: ",i," ",j," ",#stg," ",abs(dr_load(rec)(type_byte))); if i < 1 or j < i - 1 then abort("Illegal second and first parameters in string extraction operation" + str(j) + ", " + str(i)); end if; -- since rec will change, ensure that it has just one copy. -- This will up the refcounts of its children if necessary if refcount(int_of_4(rec)) > 1 then -- must copy pass("copy"); stgg := dr_load(rec); new_r := dr_new_rec(); dr_setrecbuf(new_r,stgg); dr_dirtify(new_r); increfs(new_r,1); incref(rec,-1); rec := new_r; -- substitute copy for original end if; if (contents := dr_load(rec))(type_byte) = string_record then -- simple string record case -- in the simple string record case we extract the un-written-over portions of the string, -- and chop up F + stg + L to get a new list of leaves, which are made into a node or a tree. pass("simple"); --print("simple: ",i," ",j," ",#stg); if j > (sl := sr_length(rec)) then -- get string from record abort("bs_set_slice: Illegal character index in simple string extraction operation: " + str(j) + ", " + str(sl)); end if; -- if F + stg + L fits in just one section, we perform the operation as a string slice assignment, -- to avoid string copying where unnecessary if (new_len := (i - 1) + (lstg := #stg) + (sl - j)) <= sr_hi_lim then -- handle as an optimized case pass("optimized"); sr_set_slice(rec,i,j,stg); -- replace slice --print("optimized case: ",i," ",j," ",sr_slice(rec,1,sr_length(rec))); return; -- done with this optimized case else -- we must actually chop up F + stg + L pass("chop"); first := contents(sr_char_start..sr_char_start + i - 2); -- till the i-1-th character last := contents(sr_char_start + j..sr_char_start + sl - 1); -- from the j + 1-st character pieces_tup := chop_up(first,stg,last); end if; incref(rec,-1); -- the old form of the record will lose one reference rec := bnr_make_from_tuple(pieces_tup); -- convert the list of sections returned into a tree return; end if; -- otherwise we deal with the B-tree case pass("tree"); if i > (len_this := bnr_get_cum(rec)) + 1 or j > len_this then abort("Out of range first parameter or second in string assignment operation " + str(j) + ", " + str(i) + " Length is " + str(len_this)); end if; if i = len_this + 1 then -- we have a simple concatenation operation if (lstg := #stg) < sr_low_lim then -- balance or join with last node [last_node,lcum] := bnr_comp_cum(rec,OM); -- get the last node, and its string contents ln_stg := sr_slice(last_node,1,lln := sr_length(last_node)); if lln + lstg <= sr_hi_lim then -- join with last node pass("concat_join"); old_last_node := last_node; incref(old_last_node,1); -- must increment to prevent length of referenced node from changing -- ********** TOTO: FIX THIS IN YOUR BIG_STRING CODE ********** sr_set_slice(last_node,lln + 1,lln,stg); -- last_node is copied and old_last_node loses a ref bnr_set_comp(rec,lcum,last_node); -- insert the (new) last node; old_last_node loses another ref incref(last_node,-1); -- the variable 'last_node' is now dead return; -- done with this optimized case end if; -- otherwise we will balance pass("concat_balance"); old_last_node := last_node; incref(old_last_node,1); -- must increment to prevent length of referenced node from changing -- ********** TOTO: FIX THIS IN YOUR BIG_STRING CODE ********** [new_ln_stg,stg] := chop_up(ln_stg,stg,""); -- cut into two string sections sr_set_slice(last_node,1,lln,new_ln_stg); bnr_set_comp(rec,lcum,last_node); -- insert the (possibly new) last node incref(last_node,-1); -- the variable 'last_node' is now dead bnr_insert(rec,OM,leaf := make_record(stg)); -- insert extra part at the end incref(leaf,-1); -- the variable 'leaf' is now dead return; -- done with this optimized case end if; pass("concat_chop"); pieces_tup := chop_up("",stg,""); --print("concatenation: ",pieces_tup); for piece in pieces_tup loop -- append all these pieces to the existing record bnr_insert(rec,OM,leaf := make_record(piece)); -- at the end incref(leaf,-1); -- the variable 'leaf' is now dead end loop; -- note that the first insert may copy rec and decrement its refcount if it is shared; -- in this case, ref will correctly be repaced by the new copy return; -- done with this case end if; -- end of the case in which insertion is at the very end -- otherwise we need to make an insertion into the middle of an existing big_string -- we extract the un-written-over portions F and L of the first and last sections affected, -- and chop up F + stg + L to get a new list of leaves. These replace the leaves -- of the original tree, running from the first to the last section affected. -- get the child containing the first character past i [leafi,cumi] := bnr_comp_cum(rec,i); -- get the leaf containing the first character past i, and its cumulant [leafj,cumj] := bnr_comp_cum(rec,j); -- get the leaf containing the first character past j, and its cumulant --print("insertion in middle: ",cumi," ",cumj); -- if cumi = cumj, and F + stg + L fits in just one section and does not fall beneath the required minimum -- size, we perform the operation as a string slice assignment, to avoid string copying where unnecessary if cumi = cumj -- we may be able to work in an existing section and (new_len := (lstg := #stg) + (sli := sr_length(leafi)) - (j - i + 1)) <= sr_hi_lim and new_len >= sr_low_lim then pass("nocon_opt"); prev_cum := cumi - sli; -- cumulated length before the i-th (= j-th) leaves old_leafi := leafi; incref(old_leafi,1); -- must increment to prevent length of referenced node from changing -- ********** TOTO: FIX THIS IN YOUR BIG_STRING CODE ********** sr_set_slice(leafi,i - prev_cum,j - prev_cum,stg); -- replace slice; leafi is copied, so its old version will lose 1 ref bnr_set_comp(rec,i,leafi); -- reset set the leaf of rec, since it will have changed -- the old version now loses another ref incref(leafi,-1); -- the variable leafi is now dead (compensate for extra ref added by set_comp) return; -- done with this optimized case end if; contentsi := dr_load(leafi); sli := sr_length(leafi); irel := i - (cumi - sli); first := contentsi(sr_char_start..sr_char_start + irel - 2); -- till the i-1-th character contentsj := dr_load(leafj); slj := sr_length(leafj); jrel := j - (cumj - slj); last := contentsj(sr_char_start + jrel..sr_char_start + slj - 1); -- from the j + 1-st character pieces_tup := chop_up(first,stg,last); -- now the strings in pieces_tup must be turned into leaves which replace the leaves of the tree, -- starting from leafj and ending with leafi. This iteration uses the cumulants and works backwards, -- testing for the appearance of leafi. If this is encountered before pieces_tup is exhausted, the -- reminming elements of pieces_tup are inserted at the appropriate position. If pieces_tup -- is exhausted before leafi is encountered, the leaves forward from the one replaced up -- to and including leafi is deleted. -- however, the case in which pieces_tup contins just one element may be special, since this may fall below the -- minimum number of characters wanted for a leaf pass("nocon"); if #pieces_tup = 1 and #(all := pieces_tup(1)) < sr_low_lim then -- we balance or join the short string we have with any left or right neighbor leaf if cumi > sli then -- there is a left neighbor pass("nocon_left"); [left_neighbor,-] := left_pair := bnr_comp_cum(rec,cumi - sli); -- get the left neighbor and its cumulant left_neighbor_stg := sr_slice(left_neighbor,1,sr_length(left_neighbor)); pieces_tup := chop_up(left_neighbor_stg,all,""); -- use this as the new pieces_tup [leafi,cumi] := left_pair; -- use left_neighbor as the leafi elseif cumj < len_this then -- there is a right neighbor pass("nocon_right"); [right_neighbor,-] := right_pair := bnr_comp_cum(rec,cumj + 1); -- get the right neighbor and its cumulant right_neighbor_stg := sr_slice(right_neighbor,1,sr_length(right_neighbor)); pieces_tup := chop_up("",all,right_neighbor_stg); -- use this as the new pieces_tup [leafj,cumj] := right_pair; -- use left_neighbor as the leafi end if; end if; -- after this the treatment is like that of all other cases cum_now := cumj; -- the following iteration is governed by the cumulants pieces_tup_ix_now := #pieces_tup; -- start at the end of pieces_tup while cum_now >= cumi and pieces_tup_ix_now >= 1 loop pass("nocon_iter"); oll := sr_length(bnr_comp(rec,cum_now)); -- the leaf which will be replaced bnr_set_comp(rec,cum_now,leaf := make_record(pieces_tup(pieces_tup_ix_now))); incref(leaf,-1); -- the variable 'leaf' is now dead --print("chopping up: ",#pieces_tup," ",hexify(leaf)); pieces_tup_ix_now -:= 1; -- backwards in pieces_tup cum_now -:= oll; -- backwards in the sequence of leaves end loop; while cum_now >= cumi loop -- may need to make extra deletions pass("nocon_del"); oll := sr_length(bnr_comp(rec,cum_now)); -- the leaf which will be replaced bnr_set_comp(rec,cum_now,OM); cum_now -:= oll; -- backwards in the sequence of leaves end loop; just_past_prev := cum_now + 1; -- just past the leaf previous to leafi while pieces_tup_ix_now >= 1 loop -- may need to make extra insertions pass("nocon_ins"); bnr_insert(rec,just_past_prev,leaf := make_record(pieces_tup(pieces_tup_ix_now))); incref(leaf,-1); -- the variable 'leaf' is now dead pieces_tup_ix_now -:= 1; -- backwards in pieces_tup end loop; if num_childr(old_rec := rec) = 1 then -- move from tree to simple string record representation pass("nocon_to_simp"); rec := bnr_voc(rec,1); incref(rec,1); incref(old_rec,-1); -- the child inherits one reference from its parent, so its refcount does not change end if; --print("tree_dump after: ",str(bnr_dump(rec))); end bs_set_slice; procedure bs_length(rec); -- length of a bigstring = string record length or tree length return if dr_load(rec)(type_byte) = string_record then sr_length(rec) else bnr_get_cum(rec) end if; end bs_length; procedure print_raw(); print(big_ix); end print_raw; -- raw print for debugging procedure bs_check_leaves(rec); -- checks that leaves have required maximum and minimum lengths if (refcount(int_of_4(rec)) <= 0) then print("**** MEMORY ERROR **** node with zero refcount in tree: ",hexify(rec)); return false; end if; if (contents := dr_load(rec))(type_byte) = string_record then -- node is a leaf if (srl := sr_length(rec)) > sr_hi_lim then print("EXCESSIVE STRING LENGTH DETECTED IN NODE ",hexify(rec)," ",hexify(contents)); return false; end if; return true; -- node is ok end if; -- otherwise we proceed recursively if (bsl := bs_length(rec)) /= (ls := #stg_from_bigstg(rec)) then print("***** Length Discrepancy ***** ",bsl," - ",ls," -> ",hexify(rec)," ",hexify(dr_load(rec))); stop; end if; return forall j in [1..num_childr(rec)] | bs_check_leaves_in(bnr_voc(rec,j)); procedure bs_check_leaves_in(rec); -- inner workhorse if (refcount(int_of_4(rec)) <= 0) then print("**** MEMORY ERROR **** node with zero refcount in subtree: ",hexify(rec)); return false; end if; if (contents := dr_load(rec))(type_byte) = string_record then -- node is a leaf if (srl := sr_length(rec)) > sr_hi_lim then print("EXCESSIVE STRING LENGTH DETECTED IN LEAF ",hexify(rec)," ",srl," ",hexify(contents)); return false; end if; if srl < sr_low_lim then print("INSUFFICIENT STRING LENGTH DETECTED IN LEAF ",hexify(rec)," ",srl," ",hexify(contents)); return false; end if; return true; -- leaf is ok end if; -- otherwise we proceed recursively if (bsl := bs_length(rec)) /= (ls := #stg_from_bigstg(rec)) then print("***** Length Discrepancy In Leaf***** ",bsl," ",ls); stop; end if; return forall j in [1..num_childr(rec)] | bs_check_leaves_in(bnr_voc(rec,j)); end bs_check_leaves_in; end bs_check_leaves; end big_string_pak; class big_string; -- simple wrapper object for big_string_pak var rec; -- disk record for big_string; this is either as simple string record or a B-tree root record -- (MADE PUBLIC FOR MEMORY USAGE DEBUGGING ONLY) procedure create(stg); -- create a big string from a string procedure print_raw(); -- raw print for debugging end big_string; class body big_string; -- simple wrapper object for big_string_pak use setldb,big_string_pak; procedure create(stg); rec := bs_from_stg(stg); end create; -- create a big string from a string procedure selfstr; return stg_from_bigstg(rec); end; -- string form of a big string procedure self(i..j); -- the slice extraction operation concatenates -- all the characters between the two indicated positions return bs_slice(rec,i,j); end; procedure self(i..j) := stg; -- slice assignment operation bs_set_slice(rec,i,j,stg); -- this can change rec; when the refcount of this object goes to 0, -- rec should be erased end; procedure #self; return bs_length(rec); end; -- length of this string procedure print_raw(); print(big_ix); end print_raw; -- raw print for debugging end big_string; program test_bs; -- test program for big_string package and its associated class use setldb,big_string_pak,big_string; use byteutil,disk_records_pak,db_records,B_tree_for_bigstring,string_utility_pak; var orig := "123456789a123456789b123456789c" + "123456789d123456789e123456789f" + "123456789g123456789h123456789i" + "123456789j123456789k123456789lABCDE"; -- string for tests var orig2 := "ABCDEFGHIaABCDEFGHIbABCDEFGHIc" + "ABCDEFGHIdABCDEFGHIeABCDEFGHIf" + "ABCDEFGHIg1ABCDEFGHIhABCDEFGHIi" + "ABCDEFGHIjABCDEFGHIkABCDEFGHIzZZZZZ"; -- string for tests code_pts := bs_code_pts + bnr_code_pts; -- code points to be traversed database_file_name := "bs_test_file"; -- set the name of the file to be used package_tests; -- test the underlying package big_string_class_test; -- test the class version report_points_passed(); -- get traversal report procedure check_mem(caption,rec); -- memory check utility incref(rec,-1); print(caption," ",str(if is_tuple(cm := check_memory()) then cm(1) else cm end if)); end check_mem; procedure package_tests; -- tests of the underlying package s2 := stg_from_bigstg(rc := bs_from_stg(s1 := orig)); print("short reconstruction: ",s1 = s2 and bs_check_leaves(rc)); check_mem("memcheck 1",rc); s2 := stg_from_bigstg(rc := bs_from_stg(s1 := orig + orig(1..10))); print("long reconstruction: ",s1 = s2 and bs_check_leaves(rc)); check_mem("memcheck 2",rc); s2 := stg_from_bigstg(rc := bs_from_stg(s1 := 5 * orig + orig(1..10))); print("longer reconstruction: ",s1 = s2 and bs_check_leaves(rc)); check_mem("memcheck 3",rc); s2 := stg_from_bigstg(rc := bs_from_stg(s1 := 5 * orig)); print("longer reconstruction 2: ",s1 = s2 and bs_check_leaves(rc)); check_mem("memcheck 4",rc); s2 := stg_from_bigstg(rc := bs_from_stg(s1 := 2 * orig + orig(1..10))); print("longer reconstruction 3: ",s1 = s2 and bs_check_leaves(rc)); check_mem("memcheck 5",rc); s2 := stg_past(5,rc := bs_from_stg(s1 := orig)); print("end of short reconstruction: ",s1(5..) = s2); check_mem("memcheck 6",rc); s2 := stg_past(5,rc := bs_from_stg(s1 := orig + orig(1..10))); print("end of long reconstruction: ",s1(5..) = s2); check_mem("memcheck 7",rc); s2 := stg_past(5,rc := bs_from_stg(s1 := 5 * orig + orig(1..10))); print("end of longer reconstruction: ",s1(5..) = s2); check_mem("memcheck 8",rc); s2 := stg_past(5,rc := bs_from_stg(s1 := 5 * orig)); print("end of longer reconstruction 2: ",s1(5..) = s2); check_mem("memcheck 9",rc); s2 := stg_past(5,rc := bs_from_stg(s1 := 2 * orig + orig(1..10))); print("end of longer reconstruction 3: ",s1(5..) = s2); check_mem("memcheck 10",rc); s2 := stg_till(95,rc := bs_from_stg(s1 := orig)); print("start of short reconstruction: ",s1(1..95) = s2); check_mem("memcheck 11",rc); s2 := stg_till(130,rc := bs_from_stg(s1 := orig + orig(1..10))); print("start of long reconstruction: ",s1(1..130) = s2); check_mem("memcheck 12",rc); s2 := stg_till(630,rc := bs_from_stg(s1 := 5 * orig + orig(1..10))); print("start of longer reconstruction: ",s1(1..630) = s2); check_mem("memcheck 13",rc); s2 := stg_till(620,rc := bs_from_stg(s1 := 5 * orig)); print("start of longer reconstruction 2: ",s1(1..620) = s2); check_mem("memcheck 14",rc); s2 := stg_till(255,rc := bs_from_stg(s1 := 2 * orig + orig(1..10))); print("start of longer reconstruction 3: ",s1(1..255) = s2); check_mem("memcheck 15",rc); s2 := bs_slice(rc := bs_from_stg(s1 := orig),5,95); print("short slice: ",s1(5..95) = s2); check_mem("memcheck 16",rc); s2 := bs_slice(rc := bs_from_stg(s1 := orig + orig(1..10)),5,130); print("long slice: ",s1(5..130) = s2); check_mem("memcheck 17",rc); s2 := bs_slice(rc := bs_from_stg(s1 := 5 * orig + orig(1..10)),5,630); print("longer slice: ",s1(5..630) = s2); check_mem("memcheck 18",rc); s2 := bs_slice(rc := bs_from_stg(s1 := 5 * orig),5,620); print("longer slice 2: ",s1(5..620) = s2); check_mem("memcheck 19",rc); s2 := bs_slice(rc := bs_from_stg(s1 := 2 * orig + orig(1..10)),5,255); print("longer slice 3: ",s1(5..255) = s2); check_mem("memcheck 20",rc); rc := bs_from_stg(s1 := 4 * orig + orig(1..10)); bs_set_slice(rc,3,500,""); s1(3..500) := ""; -- test slice insertion operation, long form s2 := stg_from_bigstg(rc); print("long slice shortening insert: ",s1 = s2 and bs_check_leaves(rc)); check_mem("memcheck 21",rc); rc := bs_from_stg(s1 := orig + orig(1..10)); bs_set_slice(rc,3,2,"XX"); s1(3..2) := "XX"; -- test slice insertion operation, long form s2 := stg_from_bigstg(rc); print("long slice insert: ",s1 = s2 and bs_check_leaves(rc)); check_mem("memcheck 22",rc); rc := bs_from_stg(s1 := orig + orig(1..10)); bs_set_slice(rc,3,2,3 * orig2);s1(3..2) := 3 * orig2; -- test slice insertion operation, long form s2 := stg_from_bigstg(rc); print("long slice long insert: ",s1 = s2 and bs_check_leaves(rc)); check_mem("memcheck 23",rc); rc := bs_from_stg(s1 := orig); bs_set_slice(rc,1,3,"XX"); s1(1..3) := "XX"; -- test slice assignment operation, short form s2 := stg_from_bigstg(rc); print("short slice shrink-a-bit: ",s1 = s2 and bs_check_leaves(rc)); check_mem("memcheck 24",rc); rc := bs_from_stg(s1 := orig); bs_set_slice(rc,1,3,""); s1(1..3) := ""; -- test slice assignment operation, short form s2 := stg_from_bigstg(rc); print("short slice shrinkage: ",s1 = s2 and bs_check_leaves(rc)); check_mem("memcheck 25",rc); rc := bs_from_stg(s1 := orig); bs_set_slice(rc,1,3,"YYYYY"); s1(1..3) := "YYYYY"; -- test slice assignment operation, short form s2 := stg_from_bigstg(rc); print("short slice expand: ",s1 = s2 and bs_check_leaves(rc)); check_mem("memcheck 26",rc); rc := bs_from_stg(s1 := orig); bs_set_slice(rc,1,0,"ZZZZZ"); s1(1..0) := "ZZZZZ"; -- test slice assignment operation, short form s2 := stg_from_bigstg(rc); print("short slice insert: ",s1 = s2 and bs_check_leaves(rc)); check_mem("memcheck 27",rc); rc := bs_from_stg(s1 := orig + orig(1..10)); rl := bs_length(rc); bs_set_slice(rc,rl + 1,rl,"XX"); s1(rl + 1..rl) := "XX"; -- test slice append operation, short form s2 := stg_from_bigstg(rc); print("long slice append: ",s1 = s2 and bs_check_leaves(rc)); check_mem("memcheck 28",rc); rc := bs_from_stg(s1 := 2 * orig + orig(1..10)); rl := bs_length(rc); bs_set_slice(rc,rl + 1,rl,"WWWWWWW"); s1(rl + 1..rl) := "WWWWWWW"; -- test slice append operation, long form s2 := stg_from_bigstg(rc); print("long slice append 2: ",s1 = s2 and bs_check_leaves(rc)); check_mem("memcheck 29",rc); rc := bs_from_stg(s1 := 5 * orig + orig(1..10)); rl := bs_length(rc); bs_set_slice(rc,rl + 1,rl,"YYYYY"); s1(rl + 1..rl) := "YYYYY"; -- test slice append operation, longer form s2 := stg_from_bigstg(rc); print("long slice append 3: ",s1 = s2 and bs_check_leaves(rc)); check_mem("memcheck 30",rc); rc := bs_from_stg(s1 := 5 * orig + orig(1..10)); rl := bs_length(rc); bs_set_slice(rc,rl + 1,rl,3 * orig); s1(rl + 1..rl) := 3 * orig; -- test slice append operation, longer form s2 := stg_from_bigstg(rc); print("long slice long append: ",s1 = s2 and bs_check_leaves(rc)); check_mem("memcheck 30A",rc); -- additional tests of the slice assignment operations, to proble all the special cases -- in the code, and verify the absence of memory leaks or faulty deallocations end package_tests; procedure big_string_class_test; -- test program for big_string class the_stg := big_string(s1 := orig); print("string length and selfstr: ",#the_stg = #s1," ",str(the_stg) = s1); print("short slice class: ",the_stg(5..95) = s1(5..95) and bs_check_leaves(the_stg.rec)); check_mem("memcheck 31",the_stg.rec); the_stg := big_string(s1 := orig + orig(1..10)); print("long slice class: ",the_stg(5..130) = s1(5..130) and bs_check_leaves(the_stg.rec)); check_mem("memcheck 32",the_stg.rec); the_stg := big_string(s1 := 5 * orig + orig(1..10)); print("longer slice class: ",the_stg(5..630) = s1(5..630) and bs_check_leaves(the_stg.rec)); check_mem("memcheck 33",the_stg.rec); the_stg := big_string(s1 := 5 * orig); print("longer slice class 2: ",the_stg(5..620) = s1(5..620) and bs_check_leaves(the_stg.rec)); check_mem("memcheck 34",the_stg.rec); the_stg := big_string(s1 := 2 * orig + orig(1..10)); print("longer slice class 3: ",the_stg(5..255) = s1(5..255) and bs_check_leaves(the_stg.rec)); check_mem("memcheck 35",the_stg.rec); the_stg := big_string(""); the_stg(1..0) := 5 * "abcde"; print("insert from null: ",str(the_stg) = 5 * "abcde"); the_stg(26..25) := 5 * orig; print("tail insert from null: ", str(the_stg) = (5 * "abcde" + 5 * orig) and bs_check_leaves(the_stg.rec)); -- check_mem("memcheck 36",the_stg.rec); the_stg := big_string(ori := ori_copy := 5 * "abcde" + 5 * orig); --print("the_stg after creation: ",str(the_stg)," --- the_stg: ",the_stg); -- ?????????? SETL PRINT BUG ?????????? --print("the_stg after creation: ",str(the_stg)); -- THIS WORKS OK; PREVIOUS LINE DOES NOT print("string length, selfstr, length check,selfstr check: ",#the_stg = #ori," ",str(the_stg) = ori); stg_copy := the_stg; incref(the_stg.rec,1); -- note that an extra copy has been created print("slice extraction and original: ",the_stg(1..10) = ori(1..10) and bs_check_leaves(the_stg.rec)); print("second slice extraction and original: ",the_stg(10..20) = ori(10..20)); print("slice extraction check: ",the_stg(1..10) = ori(1..10)); print("second slice extraction check: ",the_stg(10..20) = ori(10..20)); print("tail slice check: ",the_stg(10..#ori) = ori(10..#ori)); the_stg(1..0) := (extra := "XXXX.XXXX.XXXX.XXXX.XXXX.XXXX.XXXX.XXXX.XXXX.XXXX."); ori2 := ori; ori(1..0) := extra; print("concatenation of 50 characters at start: ",#the_stg," ",str(the_stg) = ori and bs_check_leaves(the_stg.rec)); print("the_stg: ",str(the_stg)); print(); print(str(stg_copy)); incref(the_stg.rec,-1); -- delete the string -- we must try long and short pure insertions, long and short impure insertions -- into a single section, and long and short insertions which erase longer -- runs the_stg := stg_copy; incref(stg_copy.rec,1); ori := ori_copy; ori(1..0) := "XXX"; the_stg(1..0) := "XXX"; print("insertion of 3 characters at start: ",#the_stg," ",str(the_stg) = ori); -- pure short insertion case incref(the_stg.rec,-1); -- delete the string the_stg := stg_copy; incref(stg_copy.rec,1); ori := ori_copy; ori(21..20) := "XXX"; the_stg(21..20) := "XXX"; print("insertion of 3 characters after position 20: ",#the_stg," ",str(the_stg) = ori); -- pure short insertion case incref(the_stg.rec,-1); -- delete the string the_stg := stg_copy; incref(stg_copy.rec,1); ori := ori_copy; ori(10..12) := "XXXX.XXXX.XXXX.XXXX.XXXX.XXXX.XXXX.XXXX.XXXX.XXXX."; the_stg(10..12) := "XXXX.XXXX.XXXX.XXXX.XXXX.XXXX.XXXX.XXXX.XXXX.XXXX."; print("50 Xs replace chars 10 thru 12: ",#the_stg," ",str(the_stg) = ori); incref(the_stg.rec,-1); -- delete the string the_stg := stg_copy; incref(stg_copy.rec,1); ori := ori_copy; the_stg(10..400) := "XXXX.XXXX.XXXX.XXXX.XXXX.XXXX.XXXX.XXXX.XXXX.XXXX."; ori(10..400) := "XXXX.XXXX.XXXX.XXXX.XXXX.XXXX.XXXX.XXXX.XXXX.XXXX."; print("50 Xs replace chars 10 thru 400: ",#the_stg," ",str(the_stg) = ori); --the_stg.print_raw(); incref(the_stg.rec,-1); -- delete the string the_stg := stg_copy; incref(stg_copy.rec,1); ori := ori_copy; ori(10..12) := "XXX"; the_stg(10..12) := "XXX"; print("3 Xs replace chars 10 thru 12: ",#the_stg," ",str(the_stg) = ori); incref(the_stg.rec,-1); -- delete the string the_stg := stg_copy; incref(stg_copy.rec,1); ori := ori_copy; ori(150..625) := "XXX"; the_stg(150..625) := "XXX"; print("3 Xs replace chars 150 thru 625: ",#the_stg," ",str(the_stg) = ori); incref(the_stg.rec,-1); -- delete the string the_stg := stg_copy; incref(stg_copy.rec,1); ori := ori_copy; ori(10..400) := "XXX"; the_stg(10..400) := "XXX"; print("3 Xs replace chars 10 thru 40: ",#the_stg," ",str(the_stg) = ori); incref(the_stg.rec,-1); -- delete the string the_stg := stg_copy; incref(stg_copy.rec,1); ori := ori_copy; ori(10..12) := ""; the_stg(10..12) := ""; print("chars 10 thru 12 deleted: ",#the_stg," ",str(the_stg) = ori); -- pure deletion case incref(the_stg.rec,-1); -- delete the string the_stg := stg_copy; incref(stg_copy.rec,1); ori := ori_copy; ori(10..400) := ""; the_stg(10..400) := ""; print("chars 10 thru 40 deleted: ",#the_stg," ",str(the_stg) = ori); -- pure long deletion case incref(the_stg.rec,-1); -- delete the string print("first character deletion test"); ori := ori_copy; the_stg1 := stg_copy; incref(stg_copy.rec,1); the_stg1(1..1) := ""; ori(1..1) := ""; print(str(the_stg1) = ori); the_stg2 := the_stg1; incref(the_stg1.rec,1); the_stg2(1..1) := ""; ori(1..1) := ""; print(str(the_stg2) = ori); the_stg3 := the_stg2; incref(the_stg2.rec,1); the_stg3(1..1) := ""; ori(1..1) := ""; print(str(the_stg3) = ori); incref(the_stg3.rec,-1); incref(the_stg2.rec,-1); incref(the_stg1.rec,-1); -- delete the string copy check_mem("memcheck 37",stg_copy.rec); -- final erasure end big_string_class_test; end test_bs; -- ************** THE MAIN DATABASE CLASS AND ITS AUXILIARY CLASSES ************** -- Internally, the database 'records' are long string sections, defined -- by their start, end characters [start, the_end]. These are accessed through an index, -- which is a cumulating vector of pairs [record_id, rec_len] with both record_id and cum_len -- in increasing order. record_id cumulates using the maximum function, and rec_len by integer addition. -- Each database has a to_string method which can be applied to each of its -- records, which in turn generates word_list(record); This word list is -- used to insert all the word occurences in the record into a 'word index' -- associated with the database. -- To find a record in the database given its key, we write -- [[key,rec_len],key_ix,cum] := big_ix{key..key_cum}; -- start := cum - rec_len + 1; -- Insertions of new records into the database are simply -- start := #db_record_string; db_record_string(start + 1..start) := binstr(record); -- where 'record' is the new record, followed by -- id_ctr +:= 1; db_index{OM} := [hex_rep(id_ctr),#db_record_string]; -- and then by insertion of the word occurences in the record -- into the word index. -- To over_write a record within the database string db, we write -- db_record_string(start..the_end) := x := binstr(record), followed by -- db_index(key_ix) := [key,#x]; and by appropriate revision of the word index. -- Database deletions are then db_record_string(start..the_end) := "", followed by -- big_ix(key_ix) := OM; and by deletion of the word occurences in the record -- from the word index. package db_iterator_pak; -- contains_iterator package for the SETL database class const simple := 1, union := 2, difference := 3, intersection := 4; sel iterator_kind(1); -- can be simple, union, difference, unstarted intersection, -- started intersection -- components of simple iterators sel occs_tree(2); -- big tuple containing the record keys referencing the words in the_word_ix sel current_occ_ptr(3); -- pointer to index of current occurence sel occs_start(4); -- first item of occurence list sel occs_end(5); -- last item of occurence list -- components of compound iterators sel L1_child(2),L2_child(3); -- sub-iterators if compound iteration sel last_L1(4),last_L2(5); -- items previously obtained for this cycle of -- compound iteration var written_refcount,written_free_list,written_in_use; -- DEBUGGING QUANTITIES, PALCED HER FOR CONVENIENCE procedure itr_create(occs_tup,beg,nd); -- create a simple contains_iterator for a SETL database procedure itr_union(iter1,iter2); -- iterator union procedure itr_intersection(iter1,iter2); -- iterator intersection procedure itr_diff(iter1,iter2); -- iterator difference procedure itr_start(tup); -- begin iterating over records containing word procedure itr_next(tup); -- continue iterating over records containing word procedure itr_number(tup); -- count operator procedure itr_arb(tup); -- arb operator; returns first element procedure itr_destroy(tup); -- destructor routine; eliminates reference to occs_tup end db_iterator_pak; package body db_iterator_pak; -- contains_iterator package for the SETL database class use setldb,B_tree_for_wdocstring,big_stg_for_wdoc_pak,byteutil,disk_records_pak,db_records; -- the underlying record and paging libraries procedure itr_create(occs_tup,beg,nd); -- create a simple contains_iterator for a SETL database --print("itr_create: ",beg," ",nd); tup := [1..3]; -- create the representing tuple. The last 2 components, occs_start and occs_end, are OM tup.iterator_kind := simple; tup.current_occ_ptr := newat(); -- initial reference is to OM tup.occs_tree := occs_tup; -- retain occurence list identity, and create an extra reference to it incref(occs_tup,1); -- add one reference to occs_tup if beg = OM then return tup; end if; -- return a null iterator -- search for the desired word. If it is not found, set current_oc = OM; -- this will terminate iteration as soon as it begins. Otherwise set ^current_occ_ptr -- to point to the first identifier in the identifiers list following the -- word (or to OM if this list is empty.) Iteration will terminate as soon as -- the next word item in the word index is encountered. -- The word index is maintained as a vector of items word, num_occs, ... -- etc. with the number of words as a cumulant tup.occs_end := nd; ^tup.current_occ_ptr := (tup.occs_start := beg) - 1; -- first item of occurence list --print("created iterator: ",tup); return tup; end itr_create; procedure itr_destroy(tup); -- destructor routine; eliminates a reference to occs_tup incref(tup.occs_tree,-1); -- eliminate one reference to occs_tup tup.current_occ_ptr := OM; -- prevent subsequent iteration end itr_destroy; procedure itr_union(iter1,iter2); -- iterator union -- last_L1(4),last_L2(5) are initialized to pointer atoms new_iter := [OM,OM,OM,newat(),newat()]; -- create an uninitialized new iterator new_iter.iterator_kind := union; new_iter.L1_child := iter1; -- set the two children new_iter.L2_child := iter2; return new_iter; end itr_union; procedure itr_intersection(iter1,iter2); -- iterator intersection -- last_L1(4),last_L2(5) are initialized to pointer atoms new_iter := [OM,OM,OM,newat(),newat()]; -- create an uninitialized new iterator new_iter.iterator_kind := intersection; new_iter.L1_child := iter1; -- set the two children new_iter.L2_child := iter2; return new_iter; end itr_intersection; procedure itr_diff(iter1,iter2); -- iterator difference -- last_L1(4),last_L2(5) are initialized to pointer atoms new_iter := [OM,OM,OM,newat(),newat()]; -- create an uninitialized new iterator new_iter.iterator_kind := difference; new_iter.L1_child := iter1; -- set the two children new_iter.L2_child := iter2; return new_iter; end itr_diff; procedure itr_start(tup); -- begin iterating over records containing word case tup.iterator_kind when simple => -- restart the iteration if possible if tup.occs_start /= OM then ^(tup.current_occ_ptr) := tup.occs_start - 1; end if; when union,difference,intersection => -- for the compound cases, just set up both values itr_start(tup.L1_child); itr_start(tup.L2_child); -- start both children ^(tup.last_L1) := itr_next(tup.L1_child); -- set up both initial values ^(tup.last_L2) := itr_next(tup.L2_child); end case; end itr_start; procedure itr_next(tup); -- continue iterating over records containing word case tup.iterator_kind when simple => -- if uninitialized or terminated, return OM if (tcop := tup.current_occ_ptr) = OM or ^tcop = OM then return OM; end if; if (current_ix := (^tcop +:= 1)) > tup.occs_end then ^tcop := OM; return OM; -- iteration ends at occs_end end if; return int_of_4(bswo_comp(tup.occs_tree,current_ix)); -- otherwise return the current record identifier, as an integer when union => -- for the union, we simply return the smaller of the two -- items, and advance it if (las_1 := ^(tup.last_L1)) = OM then if ^(tup.last_L2) = OM then return OM; end if; -- iteration is finished to_return := ^(tup.last_L2); ^(tup.last_L2) := itr_next(tup.L2_child); -- advance the L2 iteration return to_return; -- return the L2 iterator elseif (las_2 := ^(tup.last_L2)) = OM then to_return := ^(tup.last_L1); ^(tup.last_L1) := itr_next(tup.L1_child); -- advance the L1 iteration return to_return; -- return the L1 iterator elseif las_1 <= las_2 then to_return := las_1; -- return the smaller if las_1 = las_2 then ^(tup.last_L2) := itr_next(tup.L2_child); end if; -- advance L2 if it equals the L1 being returned ^(tup.last_L1) := itr_next(tup.L1_child); -- advance the smaller iteration return to_return; -- return the L1 iterator else to_return := las_2; -- return the smaller ^(tup.last_L2) := itr_next(tup.L2_child); -- advance the smaller iteration return to_return; -- return the L2 iterator end if; when difference => -- for the difference L1 - L2, we return and advance the L1 item if -- it is the smaller of the two; otherwise we advance the L2 item -- until it becomes equal to or greater than the L1 item. At each -- equality, the L1 item is also increased. --print("difference iter; last_L1,last_L2: ",^(tup.last_L1)," ",^(tup.last_L2)); if (las_1 := ^(tup.last_L1)) = OM then return OM; end if; -- iteration is finished if (las_2 := ^(tup.last_L2)) = OM or las_1 < las_2 then to_return := las_1; -- return the L1 iterator ^(tup.last_L1) := itr_next(tup.L1_child); -- advance the L1 iteration return to_return; -- return the L1 iterator elseif las_1 = las_2 then -- bypass common element ^(tup.last_L1) := itr_next(tup.L1_child); -- advance the L1 iteration end if; -- now last_L1 is larger while (las_1 := ^(tup.last_L1)) /= OM loop -- we move forward (in both lists if necessary) -- till last_L2 exceeds last_L1 while (las_2 := ^(tup.last_L2)) /= OM and las_1 > las_2 loop ^(tup.last_L2) := itr_next(tup.L2_child); end loop; if las_1 = las_2 then -- must advance last_L1 again ^(tup.last_L1) := itr_next(tup.L1_child); continue; end if; to_return := las_1; -- return the L1 iterator ^(tup.last_L1) := itr_next(tup.L1_child); -- advance the L1 iteration return to_return; -- return the L1 iterator end loop; return OM; -- iteration is finished when intersection => -- for the intersection, we advance the smaller of the two -- items until a common value is found, when we advance both while (las_1 := ^(tup.last_L1)) /= OM and (las_2 := ^(tup.last_L2)) /= OM loop if las_1 = las_2 then -- we have an intersection element to_return := las_1; -- return the L1 iterator ^(tup.last_L1) := itr_next(tup.L1_child); -- advance the L1 iteration return to_return; -- return the L1 iterator end if; if las_1 < las_2 then -- advance the smaller element ^(tup.last_L1) := itr_next(tup.L1_child); -- advance the L1 iteration else ^(tup.last_L2) := itr_next(tup.L2_child); -- advance the L2 iteration end if; end loop; return OM; -- iteration is finished end case; end itr_next; procedure itr_number(tup); -- count operator case tup.iterator_kind when simple => if tup.occs_start = OM then return 0; end if; -- word not found return tup.occs_end - tup.occs_start + 1; when union,difference,intersection => -- count occurences matching criterion num := 0; itr_start(tup); while itr_next(tup) /= OM loop num +:= 1; end loop; return num; end case; end itr_number; procedure itr_arb(tup); -- arb operator; returns first element --print("itr_arb: ",tup.iterator_kind); case tup.iterator_kind when simple => -- restart the iteration if necessary if (tos := tup.occs_start) = OM then return OM; end if; iof := int_of_4(bswo_comp(tup.occs_tree,tos)); return iof; -- otherwise return the first record identifier from the sequence for this word when union,difference,intersection => -- for the compound cases, just set up both values itr_start(tup); -- begin iterating over records containing word item := itr_next(tup); -- take the first item return if item = OM then OM else item end if; -- return the first item end case; end itr_arb; end db_iterator_pak; class contains_iterator; -- contains_iterator for the SETL database class -- these iterators represent the set of all database records which -- contain a given word. They support iteration and the count operation; -- along with intersection, union, and difference. procedure create(the_tup); -- create contains_iterator for a SETL database end contains_iterator; class body contains_iterator; -- contains_iterator for the SETL database class use setldb,byteutil,disk_records_pak,db_records,db_iterator_pak; -- for preliminary testing var tuple; -- representing quintuple for the iteration procedure create(the_tup); -- create contains_iterator for a SETL database -- tuple is [iterator_kind = kind,occs_tree,current_occ_ptr = OM,start,end] if simple; -- (but [kind,L1_child,L2_child,last_L1,last_L2] if compound) tuple := if the_tup.iterator_kind = simple then itr_create(the_tup(2),the_tup(4),the_tup(5)) else the_tup end if; -- use the package in the simple case; otherwise just save end create; procedure #self; return itr_number(tuple); end; -- count operator; use the package procedure self + x; return contains_iterator(itr_union(tuple,x.tuple)); end; -- iterator union procedure self * x; return contains_iterator(itr_intersection(tuple,x.tuple)); end; -- iterator intersection procedure self - x; return contains_iterator(itr_diff(tuple,x.tuple)); end; -- iterator difference procedure arb self; return if (res := itr_arb(tuple)) = OM then OM else stg_of_4(res) end if; end; -- select an arbitrary record id which references this word or words -- Note that itr_arb returns OM or an integer procedure iterator_start; itr_start(tuple); end; -- begin iterating over records containing word procedure iterator_next; return if (res := itr_next(tuple)) = OM then OM else [stg_of_4(res)] end if; end; -- continue iterating over records containing word end contains_iterator; -- The operations supported by the SETL database class seen below are as follows: -- db(key); -- find a database record given its key -- db(key) := x; -- store a database record, associating it with the specified key -- db with x -- create a new record and put x into it. db is returned. -- db.to_string; -- database-associated mapping of records to strings; see comment below -- db.contains(wd); -- creates and returns an iterator object for the given word -- iterations 'x in iter_obj' over these objects return the sequence of -- keys of all records containing the specified words. These iteration -- objects support the operations: #iter_obj (number of keys in the iteration group) -- arb iter_obj (first item int the iteration group, or OM if none) -- iter_obj + iter_obj (union iteration), iter_obj * iter_obj (intersection), -- iter_obj - iter_obj (iterate over difference) class database; -- SETL database class var what_tree; -- tree name for debugging var to_string := OM; -- database-associated mapping of records to strings -- this public variable should be set just once, immediately after the -- database is created. var words_just_changed; -- words affected in last database operation var record_string; -- the big string of all the records var record_index; -- maps record identifiers into record ends -- this is a big tuple consisting of record identifiers and record lengths, -- with the identifiers in order. The identifiers cumulate as strings, the lengths as integers. var identifier_generator := 0; -- cumulating generator for record identifiers var words_index; -- this is a big tuple consisting of pairs [words,len_occ_rec], where len_occ_rec is -- the length (in 'occ_rec_tup', see below) of the list of records which contain the word. -- the words are in order var occ_rec_tup; -- big tuple, containing lists of word occurences (one for each -- record in which a given word occurs. Each such list of words contains -- record identifiers in increasing order procedure create(my_to_string,file_name); -- create or reopen a SETL database procedure write_db(); -- write database to its file procedure contains(wd); -- returns the 'contains' iterator object for a given word procedure dump(); -- dump of database (DEBUGGING ONLY) procedure check_consistency(caption); -- miscelaneous consistency checks (DEBUGGING ONLY) procedure dump_recstring(); -- dump of database record_string (DEBUGGING ONLY) procedure write_refcount_data(); -- write out the refcount data (PUBLIC FOR DEBUGGING ONLY) procedure dump2(x); end database; class body database; -- SETL database class use setldb,string_utility_pak,sort_pak; -- utilities, e.g. for breaking strings at whitespace, etc. use contains_iterator; -- the iterator objects use byteutil,disk_records_pak,db_records; -- the underlying record and paging libraries use B_tree_for_bigstring,B_tree_for_wdocstring,B_tree_for_dbix,B_tree_for_wdix,big_string_pak,big_stg_for_wdoc_pak; use db_iterator_pak; -- the B_trees representing the principal database components and their indices var num_refcount_items_in_rec := rec_size/4 - 1; -- number of refcount items in a record procedure create(my_to_string,file_name); -- create or reopen a SETL database -- if file_name starts with a minus sign, we strip of the minus sign and start a new database in the indicated file -- otherwise we read the file, which should contain a previously written database, and use its contents to -- initialize the database may_minus := match(file_name,"-"); database_file_name := file_name; -- the global database_file_name is set from file_name if may_minus = "" then -- read the file and use its contents to initialize the database read_db(file_name); to_string := my_to_string?lambda(rec); return ""; end lambda; return; end if; -- otherwise the database (and its representing trees) are initially empty. close(open(file_name,"TEXT-OUT")); -- delete the file if it already exists; othewise create it to_string := my_to_string?lambda(rec); return ""; end lambda; -- by default (since a default is required), all records map to the null string record_string := dr_new_rec(); dr_load(record_string); -- we start with an empty string of records srgg:=dr_load(record_string); srgg(type_byte) := string_record; dr_setrecbuf(record_string,srgg); dr_dirtify(record_string); record_index := dbix_create(); -- we start with an empty index of records; accumulates rec_id and length occ_rec_tup := dr_new_rec(); srgg:=dr_load(occ_rec_tup); srgg(type_byte) := wdoccs_string_record; dr_setrecbuf(occ_rec_tup,srgg); dr_dirtify(occ_rec_tup); -- initially no word has any occurence. -- the occurences tuple is a list of record identifiers, divided into sections -- by the cumulative counts held in 'words_index'. Within each such section -- the record identifiers are in order; they cumulate as strings and by count words_index := wix_create(); -- we start with an empty index of word occurences -- the words index has 2 cumulants, namely the words themselves, -- accumulated using 'max', and the total length of the occurence list for each word end create; procedure identity_map(x); return x; end identity_map; -- identity mapping procedure self(key); -- find a database record given its key -- the database record index call dbix_comp_cum2(rec,x) returns values of the form [rid,len,cum_len] --print("Self [record_index,key]=",[record_index,int_of_4(key)]); if (found := dbix_comp_cum2(record_index,key)) = OM then return OM; end if; -- item might not be found [key_in,rec_len,stg_end] := found; --print("Self(key)... [key_in,rec_len,stg_end]=", -- [int_of_4(key_in),rec_len,stg_end]); if key_in /= key then return OM; end if; -- no such key found --print(stg_end-rec_len+1," ",stg_end," Rec ",hexify(record_string)); return unbinstr(bs_slice(record_string,stg_end - rec_len + 1,stg_end)); -- otherwise decode the record --return bs_slice(record_string,stg_end - rec_len + 1,stg_end); -- otherwise decode the record end; procedure self(key) := rec; -- update a database record using its key -- if rec = OM we delete the database record with the given key -- print("Self:= [record_index,key]=",[record_index,int_of_4(key)]); [key_in,rec_len,stg_end] := dbix_comp_cum2(record_index,key); -- print("Self(key):=rec ... [key_in,rec_len,stg_end]=", -- [int_of_4(key_in),rec_len,stg_end]); if rec = OM then -- deletion is desired if key_in /= key then return; end if; -- key not present so no deletion needed being_deleted := unbinstr(bs_slice(record_string,strt := stg_end - rec_len + 1,stg_end)); -- the record being deleted deleted_wds := {wd: wd in breakup(to_string(being_deleted),"\n\t\r ") | wd /= ""}; -- set of words being deleted delete_wds(deleted_wds,key); -- delete these word occurences from the words_index bs_set_slice(record_string,strt,stg_end,""); -- delete the record from the record_string dbix_set_comp2(record_index,key,OM); -- delete the record data from the record_index words_just_changed := { }; -- we don't count deletions as changes return; -- done with this case end if; -- otherwise a record is being changed start_m1 := stg_end - rec_len; -- the record will be written after this position added_wds := {wd: wd in breakup(to_string(rec),"\n\t\r ") | wd /= ""}; -- set of words being added words_just_changed := added_wds; if key_in /= key then -- key not present, so just insert new record, using key -- Note: this operation should be quite rare -- get the string start of the record before which the insertion will -- take place bs_set_slice(record_string,start_m1 + 1,start_m1,bstr := binstr(rec)); -- insert the record into the record_string dbix_set_comp2(record_index,key_in,[key,#bstr]); -- insert the record data into the record_index, immediately prior to key_in add_wds(added_wds,key); -- add these word occurences to the words_index return; -- done with this case end if; -- otherwise a record is actually being changed. Find the new words being -- added, and the old words being subtracted old_record := unbinstr(bs_slice(record_string,start_m1 + 1,stg_end)); -- the record as it was deleted_wds := {wd: wd in breakup(to_string(old_record),"\n\t\r ") | wd /= ""}; -- set of words being deleted new_wds := added_wds - deleted_wds; -- the new words to be indexed add_wds(new_wds,key); -- add these word occurences to the words_index --print("new_wds,key,words_index: ",new_wds," ",key," ",words_index); deleted_wds -:= added_wds; -- set of words really being deleted delete_wds(deleted_wds,key); -- delete these word occurences from the words_index bs_set_slice(record_string,start_m1 + 1,stg_end,bstr := binstr(rec)); -- change the record in the record_string dbix_set_comp2(record_index,key,[key,#bstr]); -- change the record data in the record_index words_just_changed := added_wds; end; procedure delete_wds(wd_list,key); -- delete word occurences from the words_index --if "D_4" in wd_list then -- print(what_tree," delete_wds - wd_list,key: ",wd_list," ",hexify(key)," ",wix_dump(words_index)," ", -- bswo_length(occ_rec_tup)," ",hexify(occ_rec_tup)," ",hexify(stg_from_bswostg(occ_rec_tup))); --end if; for wd in wd_list loop -- search over the words in order -- find index of 'wd' in the words_index [-,occs_len,up_limix] := wix_comp_cum2(words_index,wd); -- get boundaries of word occurences section [key_in,key_ix] := wo_first_past_to(occ_rec_tup,key,up_limix - occs_len + 1,up_limix); -- key_in must be identical with key bswo_set_comp(occ_rec_tup,key_ix,OM); -- delete the key occurence -- if the last occurence of the word has just been deleted, then delete -- the word itself if occs_len = 1 then wix_set_comp2(words_index,wd,OM); continue; end if; -- otherwise reduce the number of occurences in the words_index by 1 wix_set_comp2(words_index,wd,[wd,occs_len - 1]); end loop; --if "D_4" in wd_list then -- print(what_tree," at end of delete_wds: ",wix_dump(words_index)," ",bswo_length(occ_rec_tup)," ", -- hexify(occ_rec_tup)," ",hexify(stg_from_bswostg(occ_rec_tup))); --end if; end delete_wds; procedure add_wds(wd_list,key); -- insert word occurences into the words_index --print(what_tree," add_wds - wd_list,key: ",wd_list," ",hexify(key)); for wd in wd_list loop -- search over the words in order -- find index of 'wd' in the words_index --print("adding: ",wd," ",hexify(key)); if (item:= wix_comp_cum2(words_index,wd)) = OM then -- wd not bounded; insert it at end wix_insert(words_index,OM,[wd,1]); -- there is just one occurence --print("not bounded: ",hexify(occ_rec_tup)," ",hexify(stg_from_bswostg(occ_rec_tup))); bswo_insert(occ_rec_tup,OM,key); -- insert the occurence at the end of occ_rec_tup continue; -- done with this case end if; [wd_there,occs_len,up_limix]:= item; -- get wd in position, and boundaries of word occurences section if wd_there /= wd then -- word not present; insert before word found --print("not present: ",wd_there); wix_insert2(words_index,wd_there,[wd,1]); -- insert: there is just one occurence bswo_insert(occ_rec_tup,up_limix - occs_len + 1,key); -- insert the occurence before all the priors continue; -- done with this case end if; -- otherwise the word already occurs in some record; but the new -- occurence must be inserted low_limix := up_limix - occs_len + 1; -- we search for the key in the indicated range poss_occ_key_ix := wo_first_past_to(occ_rec_tup,key,low_limix,up_limix); if poss_occ_key_ix = OM then -- there is no later occurence on record for this -- word, so it should be inserted at the end of -- the occurence list section to which it belongs bswo_insert(occ_rec_tup,up_limix + 1,key); -- insert the key occurence at section end --print("already occurs: ",wd," ",[wd,occs_len + 1]); wix_set_comp2(words_index,wd,[wd,occs_len + 1]); -- note that there is one additional occurence continue; -- done with this case end if; [-,next_key_ix] := poss_occ_key_ix; bswo_insert(occ_rec_tup,next_key_ix,key); -- insert the key occurence before the following occurence wix_set_comp2(words_index,wd,[wd,occs_len + 1]); -- note that there is one additional occurence end loop; --print(what_tree," at end of add_wds: ",wix_dump(words_index)," ",bswo_length(occ_rec_tup)," ",hexify(occ_rec_tup)," ",hexify(stg_from_bswostg(occ_rec_tup))); end add_wds; procedure self with rec; -- add a new record to a database -- put the record into the database, and get its new key nrs := bs_length(record_string); bs_set_slice(record_string,nrs + 1,nrs,bsr := binstr(rec)); -- insert the new key at the end of the record index, with its length -- for cumulation dbix_insert(record_index,OM,[key := stg_of_4(identifier_generator +:= 1),#bsr]); -- now calculate the string of the record and find the set of words wd_list := {wd: wd in breakup(to_string(rec),"\n\t\r ") | wd /= ""}; add_wds(wd_list,key); -- add these word occurences to the words_index words_just_changed := wd_list; return self; -- return the database itself as the result of this operation end; procedure contains(wd); -- returns the 'contains' iterator object for a given word -- tuple to be supplied is [iterator_kind = kind,occs_tree,current_occ_ptr = OM,start,end] (since simple); -- (but [kind,L1_child,L2_child,last_L1,last_L2] if compound) --print("creating 'contains' iterator: ",wd); if (initial_item := wix_comp_cum2(words_index,wd)) = OM or initial_item(1) /= wd then -- the word index call wix_comp_cum2(rec,x) returns values of the form [rid,len,cum_len] --print("iteration will terminate"); return contains_iterator([simple,occ_rec_tup]); -- word does not occur, so iteration will terminate end if; [-,occ_section_len,the_occs_end] := initial_item; --print("iteration tuple is: ",[simple,occ_rec_tup,OM,the_occs_end - occ_section_len + 1,the_occs_end]); if the_occs_end > bswo_length(occ_rec_tup) then print("CREATING BAD CONTAINS_ITERATOR: ",the_occs_end - occ_section_len + 1," ",the_occs_end," ", bswo_length(occ_rec_tup)); stop; end if; return contains_iterator([simple,occ_rec_tup,OM,the_occs_end - occ_section_len + 1,the_occs_end]); end contains; procedure write_db(); -- write database to its file -- we only need to flush_all, write the refcount data to the file, and write one additional record -- containing (i) the location of the roots of the various database trees; (ii) the value of recno_generator; -- (iii) the location of the first record containing the refcount data; since the remainder of the -- database is already present in the file dr_flush_all(); --stg := dr_load(rec_one := "\x00\x00\x00\x01"); -- initialize the first record, which contains the needed locator data -- Don't load the record! -- Instead we pass a string containing the parameters. stg:=" "*32; stg(1..4) := record_string; -- bigstring containing records stg(5..8) := record_index; -- index for bigstring containing records stg(9..12) := occ_rec_tup; -- big tuple containing word occurences stg(13..16) := words_index; -- index for big tuple containing word occurences stg(17..21) := stg_of_5(identifier_generator); -- identifier_generator value; this is 5 bytes -- Recno generator is handled internally in the native package -- we don't have to save it. --stg(22..25) := stg_of_4(dr_recno_generator()); -- recno_generator value (maximum record used) --refcount_data_start := dr_write_refcount_data(); -- write out the refcount data --stg(26..30) := stg_of_4(refcount_data_start); -- location of the first record containing the refcount data --dr_load(rec_one); --dr_setrecbuf(rec_one,stg); --dr_dirtify(rec_one); --dr_flush_all(); -- write eveything out db_close_db(stg); -- close database end write_db; procedure read_db(file_name); -- read database from the indicated file -- we only need to reconstruct refcount data, free_list, and in_use data from the file, -- and set (i) the location of the roots of the various database trees; (ii) the value of recno_generator; -- since the remainder of the database is already present in the file print("reading database"); db_open_db(file_name); stg:=db_read_param(); record_string := stg(1..4); -- bigstring containing records record_index := stg(5..8); -- index for bigstring containing records occ_rec_tup := stg(9..12); -- big tuple containing word occurences words_index := stg(13..16); -- index for big tuple containing word occurences identifier_generator := int_of_5(stg(17..21)); -- identifier_generator value; this is 5 bytes --read_refcount_data(refcount_data_start,recno_generator); -- reconstruct the refcount data --print("read_db check: ",written_refcount(2..recno_generator) = refcount(2..recno_generator)," ",written_free_list = free_list," ",written_in_use = in_use); --print(str(written_refcount)," ",str(refcount)); --print(str([hexify(x) : x in written_free_list])," ",str([hexify(x) : x in free_list])); --print(str([hexify(x) : x in (written_in_use - in_use)])," ",str([hexify(x) : x in (in_use - written_in_use)])); end read_db; procedure write_refcount_data(); -- write out the refcount data -- we group the refcount data into a series of 4-byte fields within a chained list of blocs, -- fitting these into free-list elements where possible, but otherwise using new records. -- each block contains a pointer to the next block in the chain. if recno_generator = 1 then return; end if; -- no recno data to write written_refcount := refcount; written_free_list := free_list; written_in_use := in_use; --print("recno_generator: ",recno_generator," ",#refcount," ",str(refcount)," ",str([hexify(x): x in free_list])," ",str([hexify(x): x in in_use])); current_stg := dr_load(current_record := dr_new_rec()); -- allocate a first record to hold refcount data ref_data_start := current_record; place_in_current_record := 1; for j in [1..recno_generator] loop if (place_in_current_record +:= 4) > rec_size then -- must write another record -- SPAM dirty with:= current_record; -- note that current record has ween written next_record := current_stg(1..4) := dr_new_rec(); -- allocate another record to hold refcount data dr_load(current_record); -- prepare to write the next record dr_setrecbuf(current_record,current_stg); dr_dirtify(current_record); --print("record to be written: ",hexify(current_record)," ",hexify(current_stg)); current_stg := dr_load(current_record := next_record); -- start on the new string place_in_current_record := 5; -- restart the write-position index end if; current_stg(place_in_current_record..place_in_current_record + 3) := stg_of_4(refcount(j)?0); end loop; dr_setrecbuf(current_record,current_stg); dr_dirtify(current_record); --print("record to be written: ",hexify(current_record)," ",hexify(current_stg)); --print(hexify(current_record)," ",hexify(current_stg)); return ref_data_start; end write_refcount_data; procedure read_refcount_data(refcount_data_start,recno_generator); -- reconstruct the refcount data -- we read the refcount data into a series of 4-byte fields withing a chained list of blocs, -- fitting these into free-list elements where possible, but otherwise using new records. -- each block contains a pointer to the next block in the chain. free_list := []; in_use := { }; -- this routine also reconstructs the free_list tuple and in_use set place_in_current_record := 1; rno := int_of_4(refcount_data_start); gets(file_handle,(rno - 1) * rec_size + 1,rec_size,current_stg); -- read the current record --print("first record read: ",hexify(refcount_data_start)," ",hexify(current_stg)); next_record := current_stg(1..4); -- prepare to read the next record records_containing_refcounts := [refcount_data_start]; -- we need to collect these records, and free them --SPAM refcount := [1..recno_generator]; -- allocate a vector; its components will be over-written for j in [1..recno_generator] loop if (place_in_current_record +:= 4) > rec_size then -- must read next record place_in_current_record := 5; current_record := next_record; rno := int_of_4(current_record); gets(file_handle,(rno - 1) * rec_size + 1,rec_size,current_stg); -- read the current record --print("record read: ",hexify(current_record)," ",hexify(current_stg)); records_containing_refcounts with:= current_record; -- we need to collect these records, and free them next_record := current_stg(1..4); -- prepare to read the next record end if; -- SPAM refcount(j) := rcj := int_of_4(current_stg(place_in_current_record..place_in_current_record + 3)); if rcj = 0 then -- record belongs to free list free_list with:= stg_of_4(j); else -- record is in use in_use with:= stg_of_4(j); end if; end loop; for rec in records_containing_refcounts loop -- we free all these records, and put them on the free list spam:=0; -- SPAM refcount(int_of_4(rec)) := 0; free_list with:= rec; in_use less:= rec; end loop; free_list := [rc: rc in {rec : rec in free_list | (ir := int_of_4(rec)) <= recno_generator and ir /= 1}]; end read_refcount_data; procedure dump2(x); print("***** Record ***** ",hexify(x)," ",hexify(dr_load(x))); end; procedure dump(); -- dump of database -- we dump the database by iterating over its record index, -- and printing each successive record; and also over the words index, -- and printing the list of records containing occurences of each word print(what_tree," DATABASE RECORDS:"); --print("record_string: ",bs_length(record_string)," ",stg_from_bigstg(record_string)); --print("occ_rec_tup: ",hexify(stg_from_bswostg(occ_rec_tup))); cur_cum := 1; -- cumulant value of current record ril := dbix_get_cum(record_index); -- final cumulant value for whole records index bs_check_leaves(record_string); -- check the record string for consistency while cur_cum < ril loop -- loop over all records [rid,rid_len] := dbix_comp_cum(record_index,cur_cum); print("record ",int_of_4(rid),": [",cur_cum,"..",cur_cum + rid_len - 1,"] ", unbinstr(bs_slice(record_string,cur_cum,cur_cum + rid_len - 1))); cur_cum +:= rid_len; -- advance to the next record end loop; print("INDEXED WORDS AND WORD OCCURENCES:"); cur_cum := 1; -- cumulant value of current word wil := wix_get_cum(words_index); -- final cumulant value for whole words index while cur_cum <= wil loop -- loop over all words [word,num_occs,word_cum] := wix_comp_cum(words_index,cur_cum); word_tup := [int_of_4(bswo_comp(occ_rec_tup,j)): j in [cur_cum..word_cum]]; -- tuple of word occurences print(word,": (",cur_cum,"..",word_cum," )",word_tup); cur_cum := word_cum + 1; end loop; print(); return ""; end dump; procedure check_consistency(caption); -- miscellaneous consistency checks (DEBUGGING ONLY) if bswo_length(occ_rec_tup) < wix_get_cum(words_index) then print(caption," DATABASE INCONSISTENCY: OCCURENCE STRING LENGTH IS LESS THAN TOTAL WORD INDEX CUMULANT ", hexify(stg_from_bswostg(occ_rec_tup)),"\n",wix_dump(words_index)); stop; end if; end check_consistency; procedure dump_recstring(); -- dump of database record_string (DEBUGGING ONLY) print(bnr_dump(record_string)); end dump_recstring; end database; class active_database; -- 'active' database class, with hibernators procedure create(my_to_string,my_do_transaction,file_name); -- create or reopen an active database procedure write_act(); -- write database to its file procedure contains(wd); -- returns the 'contains' iterator object for a given word procedure watch(Tid,word_list,transaction_state); -- 'watch' routine, which causes a hybernator to sleep, watching a list of words procedure dump(); -- dump database (DEBUGGING) end active_database; class body active_database; -- 'active' database class, with hibernators use setldb,database; -- use the standard database class; there will be two of these use byteutil,disk_records_pak,db_records,db_iterator_pak; use string_utility_pak; var inner_database,hibernators; -- the actual database, and the hibernators watching and writing it var in_exec_loop := false; -- flag indicating whether or not we are in the execution loop of 'alert_hibs' var line_to_stg,do_transaction; -- the two procedural parameters passed to 'create' procedure create(my_to_string,my_do_transaction,file_name); -- create or reopen an active database -- note that the following lines will read and existing database pair unless file_name starts with '-' line_to_stg := my_to_string; do_transaction := my_do_transaction; -- save the procedure parameters inner_database := database(line_to_stg,file_name + ".db"); -- create an empty database, or read inner_database.what_tree := "inner"; -- for debugging hibernators := database(hiber_to_stg,file_name + ".hib"); -- create an empty hibernators database, or read hibernators.what_tree := "hibernators"; -- for debugging end create; procedure hiber_to_stg(tup); -- convert a hibernator's state tuple to an index string -- there are several cases. For 'newborn' hibernators, the tuple is [word_watched_for] -- For 'hibernating' hibernators, the tuple is [word_watched_for,state]. -- For active hibernators, the tuple is [".",wakeup_wd,state] -- For the currently executing hibernator (if any), the tuple is ["",wakeup_wd,state] return tup(1); -- we simply return the first component end hiber_to_stg; procedure write_act(); -- write an active database to its file inner_database.write_db(); hibernators.write_db(); -- write both parts end write_act; procedure dump(); -- dump database (DEBUGGING) print("INNER_DATABASE"); inner_database.dump(); print("HIBERNATORS"); hibernators.dump(); end dump; procedure contains(wd); -- returns the 'contains' iterator object for a given word if is_tuple(wd) then return hibernators.contains(wd); end if; -- iterate over the hibernators return inner_database.contains(wd); -- iterate over the inner database end contains; procedure self(key); -- find a database record or hibernator record given its key if is_tuple(key) then return hibernators(key(1)); end if; -- reference the hibernators return inner_database(key); -- get it from the inner database end; procedure self(key) := rec; -- update a database record or hybernator record using its key --print("assigning record: ",hexify(key)," ",rec); if is_tuple(key) then -- assignment to the hibernators hibernators(key(1)) := rec; hibernators.check_consistency("hibernators"); return self; end if; inner_database(key) := rec; -- write the inner database inner_database.check_consistency("inner_database"); alert_hibs(); -- alert the hibernators, which may modify the inner database end; procedure self with rec; -- add a new record or hybernator to a database -- for external use, we encode additons to the hibernators by wrapping them into singleton sets --print("new record: ",rec); if is_set(rec) and #rec = 1 then -- addition to the hibernators hibernators with:= arb(rec); hibernators.check_consistency("hibernators"); return self; end if; inner_database with:= rec; -- write the inner database inner_database.check_consistency("inner_database"); --print("after insertion of new record: "); inner_database.dump(); alert_hibs(); -- alert the hibernators, which may modify the inner database return self; end; procedure alert_hibs(); -- alert the hibernators, which may modify the inner database words_alerted := inner_database.words_just_changed; -- words affected by the last operation for wd in words_alerted loop -- alert all hibernators watching this word for hib in hibernators.contains(wd) loop hibernators(hib) := [".",wd,hibernators(hib)(2)]; -- those hibernating have '.' as first component -- save only the alerting word and the state, not the list of words being watched end loop; end loop; if in_exec_loop then return; end if; -- to eliminate useless recursion in_exec_loop := true; -- set the execution loop flag while (first_active := arb(hibernators.contains("."))) /= OM loop -- hibernators potentially active have '.' as first component hibernators(first_active) := [""] + hibernators(first_active)(2..); -- the hibernator which is executing has "" as first component, so is not sensitive to any word change -- i.e. now hibernators(first_active) is ["",wd,transaction_state] --print("activating: ",hexify(first_active)," ",hibernators(first_active)," ",hibernators.dump()); -- incref(old_inner_database := inner_database,1); incref(old_hibernators := hibernators,1); -- note two more references modified_db := do_transaction(first_active,self); -- and call the transaction (cautiously); this will modify a copy of the database, -- but not the hibernators inner_database := modified_db.inner_database; hibernators := modified_db.hibernators; -- copy the components of the modified database -- incref(old_inner_database,-1); incref(old_hibernators,-1); -- drop these two references end loop; in_exec_loop := false; -- drop the execution loop flag end alert_hibs; procedure watch(Tid,word_list,transaction_state); -- 'watch' routine, which causes a hybernator to sleep, -- watching a list of words -- As the Tid of a transaction (i.e. hibernator), it is best to use its record id. -- When the transaction is active but not running, its record is [".",transaction_state], -- and it is simply indexed by ".". When it is hibernating, and watching the words of word_list, -- it is indexed by all the words in word_list. To find all the transactions hibernating on the word wd, -- we examine the iterators -- rid in in hibernators.contains(wd) -- and execute -- for rid in in hibernators.contains(wd) loop db(rid) := [".",[wd],hibernators(hib)(2)] end loop; -- which makes the transaction potentially active. -- To establish a 'watch', we simply execute --print("watch: ",hexify(Tid)," ",word_list," ",transaction_state); hibernators(Tid) := [word_list(1),transaction_state]; -- this drops the hibernator from active state -- a hibernator which is watching has the (unique) word for which it is watching as its first component return self; -- return the database as it now is end watch; end active_database; program test; -- test program for SETL database class use database; -- use the SETL database class use string_utility_pak; db := database(pair_to_stg,"-disk_record_file"); -- create an empty database, to store test pairs db with:= ["Jack",68]; --print("first pair added\n",db.dump()); -- add a first pair db with:= ["Diana",41]; --print("second pair added\n",db.dump()); -- add a second pair db with:= ["Rachel",41]; --print("third pair added\n",db.dump()); -- add a third pair print("database with 3 records",db.dump()); print("testing arb: list a record containing 'Jack': ",db(jack_key := arb(db.contains("N:Jack")))); print("testing arb: list a record containing 'Diana': ",db(arb(db.contains("N:Diana")))); print("testing arb: list a record containing 'A:68': ",db(arb(db.contains("A:68")))); print("testing arb: list a record containing 'A:41': ",db(arb(db.contains("A:41")))); print("testing arb: count records containing 'A:41': ",#db.contains("A:41")); print("testing change of record, set ['Jack',68] to ['Jack',69]"); db(jack_key) := ["Jack",69]; -- another year gone by print(db.dump()); print("testing arb after change: list a record containing 'N:Jack': ",db(arb(db.contains("N:Jack")))); print("testing arb after change: list a record containing 'A:68': ",a_rec := arb(db.contains("A:68"))," ", if a_rec /= OM then db(a_rec) else "Non-existent ****" end if); print("testing arb after change: list a record containing 'A:69': ",db(arb(db.contains("A:69")))); print("testing arb after change: list a record containing 'A:41': ",a_rec := arb(db.contains("A:41"))," ",if a_rec /= OM then db(a_rec) else "Non-existent ****" end if); print("testing arb after change: list a record containing 'N:Diana': ",db(arb(db.contains("N:Diana")))); -- now start testing the basic iterators print("testing the basic iterators: list record keys containing 'Jack'"); for key in (jack_recs := db.contains("N:Jack")) loop print(hexify(key)); end loop; print("testing the basic iterators2: list records containing 'Jack'"); for key in (jack_recs := db.contains("N:Jack")) loop print(db(key)); end loop; print("testing the basic iterators3: list records containing 'Diana'"); for key in (jack_recs := db.contains("N:Diana")) loop print(db(key)); end loop; print("testing the basic iterators4: list records containing '41'"); for key in (jack_recs := db.contains("A:41")) loop print(db(key)); end loop; print("\n****** Union test - records containing 'Jack' or 'Diana' ******"); for key in (jd_recs := (db.contains("N:Jack") + db.contains("N:Diana"))) loop print(hexify(key)," ",db(key)); end loop; print("\n****** Union test - records containing 'Rachel' or 'Diana' ******"); for key in (jd_recs := (db.contains("N:Rachel") + db.contains("N:Diana"))) loop print(hexify(key)," ",db(key)); end loop; print("testing arb for union: list a record containing 'Jack' or 'Diana': ",key := arb jd_recs," ",db(key)); print("\n****** Intersection test - records containing '41' and 'Diana' ******"); for key in (jd_recs := (db.contains("A:41") * db.contains("N:Diana"))) loop print(hexify(key)," ",db(key)); end loop; print("\n****** Intersection test - records containing '41' and 'Rachel' ******"); for key in (jd_recs := (db.contains("A:41") * db.contains("N:Rachel"))) loop print(hexify(key)," ",db(key)); end loop; print("testing arb for Intersection: list a record containing '41' and 'Rachel': ",key := arb jd_recs," ",db(key)); print("\n****** Difference test - records containing '41' andnot 'Diana' ******"); for key in (jd_recs := (db.contains("A:41") - db.contains("N:Diana"))) loop print(hexify(key)," ",db(key)); end loop; print("\n****** Difference test - records containing '41' andnot 'Rachel' ******"); for key in (jd_recs := (db.contains("A:41") - db.contains("N:Rachel"))) loop print(hexify(key)," ",db(key)); end loop; db with:= ["Jack", 6]; -- add a younger jack print("testing the basic iterators5: list records containing '6'"); for key in (jack_recs := db.contains("A:6")) loop print(db(key)); end loop; print("****** Print all records containing 'Jack' ******"); for key in (jack_recs := db.contains("N:Jack")) loop print(db(key)); end loop; print("recs with Jack and '69': ",hexify(jack_rec69 := arb(db.contains("N:Jack") * db.contains("A:69")))); if jack_rec69 /= OM then print(db(jack_rec69)); end if; print("recs with Jack and '6': ",hexify(jack_rec6 := arb(db.contains("N:Jack") * db.contains("A:6")))); if jack_rec6 /= OM then print(db(jack_rec6)); end if; print("\n****** Difference test ******"); print("recs with Jack but not '6'"); for x in (jnd_recs := db.contains("N:Jack") - db.contains("A:6")) loop print(db(x)); end loop; print("\n****** Intersection test ******"); print("recs with Jack and '6'"); for x in (jnd_recs := db.contains("N:Jack") * db.contains("A:6")) loop print(db(x)); end loop; print("\n****** Deleting a record -- the 'Jack' with A:69 ******"); jack_rec69 := jack_key := arb(db.contains("N:Jack") * db.contains("A:69")); db(jack_rec69) := OM; print(db.dump()); print("remaining recs with Jack"); for x in (jack_recs := db.contains("N:Jack")) loop print(db(x)); end loop; print("\n****** Restoration Test - Restoring ['Jack',68]; plus Difference Test ******"); db with:= ["Jack",68]; -- the other Jack again print(db.dump()); print("Records with 'Jack' but not 'Diana'"); for x in (jnd_recs := db.contains("N:Jack") - db.contains("N:Diana")) loop print(db(x)); end loop; print("\n****** Deleting another record -- the 'Jack' with A:6 ******"); print("Record identifier is: ",hexify(lit_jack_rec := arb(db.contains("N:Jack") * db.contains("A:6")))); print("Deleting ",db(lit_jack_rec)); db(lit_jack_rec) := OM; print(db.dump()); print("Records with 'Jack'"); for x in (jack_recs := db.contains("N:Jack")) loop print(db(x)); end loop; for j in [1..20] loop db with:= ["Jack",j]; db with:= ["Diana",j]; db with:= ["Rachel",j]; end loop; print("pairs added\n",db.dump()); print("records with 'A:3'"); print(str([db(x): x in (jnd_recs := db.contains("A:3"))])); print("records with 'N:Diana'"); print(str([db(x): x in (jnd_recs := db.contains("N:Diana"))])); print("records with 'A:3' but not 'N:Diana'"); print(str([db(x): x in (jnd_recs := db.contains("A:3") - db.contains("N:Diana"))])); print("records with 'N:Diana' but not 'A:3'"); print(str([db(x): x in (jnd_recs := db.contains("N:Diana") - db.contains("A:3"))])); print("records with 'N:Diana' and 'A:3'"); print(str([db(x): x in (jnd_recs := db.contains("N:Diana") * db.contains("A:3"))])); print("records with 'N:Diana' or 'A:3'"); print(str([db(x): x in (jnd_recs := db.contains("N:Diana") + db.contains("A:3"))])); print("\n Memory Leak Tests"); print("TO BE ADDED"); print("\n Database Copy Tests"); print("TO BE ADDED"); print("\n ***********END OF TESTS ***********"); db.write_db(); stop; procedure pair_to_stg(p); -- convert name, age pair to string [nm,age] := p; return " N:" + nm + " A:" + str(age); end pair_to_stg; end test; -- Distributed versions of the proposed databases can be organized as sets of records -- maintained independently on multiple computers, each of which stores a section -- of the big_string representing the database, and an index to the words -- in this section of the big_string. parallel searches over these -- separate sections generate sets, mustly null, which are combined into -- larger sets. The search can be optimized by storing hints of the form -- wd:* on the processors which distribute the queries. If no word of this -- form is present on a processor P when a search for a string starting in this way is -- being distributed, P need not receive a copy of the query. -- if the free_list is empty when a section is requested, a 'space exhausted' -- abort results. program test2; -- test program for SETL database class use database; -- use the SETL database class use string_utility_pak; db := database(pair_to_stg,"-disk_record_file"); -- create an empty database, to store test pairs db with:= ["King",212,"5001317"]; db with:= ["Queen",718,"2681702"]; db with:= ["Slave",212,"9983470"]; print("database with 3 records",db.dump()); stop; stop; procedure pair_to_stg(p); -- convert name, age pair to string [nm,ac,ph] := p; return " N_" + nm + " AC_" + str(ac)+" P_"+ph; end pair_to_stg; end test2;