#!/usr/bin/perl
#
#           RiLax 0.1.03
#
# plain text database search engine, 
# version 0.1
# (c) Sergej Tarasov, 2001
#
# Homepage: http://risearch.org/
# email: risearch@risearch.org
# Last modified: 08.03.2002



require './config.pl';
$| = 1;    # autoflush STDOUT so progress lines show up as they are printed

# Under a web server (CGI) the progress report needs a header first.
if (exists($ENV{'GATEWAY_INTERFACE'})) {print "Content-Type: text/plain\n\n"}

use Cwd;
print "Current directory: ",cwd,"\n";


# tr/// does not interpolate variables, so the character classes
# ($CAP_LETTERS, $LOW_LETTERS, $numbers - presumably set by config.pl) are
# spliced into source text and compiled once with a string eval.
# Both closures take a reference to the string and modify it in place.

# Replace every run of characters that are not letters, digits or '-' with a
# single space.
$code = "\${\$_[0]} =~ tr/-a-zA-Z$CAP_LETTERS$LOW_LETTERS$numbers/ /cs;";
$remove_non_alphabetic = eval "sub { $code }"
    or &my_die("Died: could not compile the word character filter - $@");

# Map upper-case letters (ASCII plus the configured national ones) to lower case.
$code = "\${\$_[0]} =~ tr/A-Z$CAP_LETTERS/a-z$LOW_LETTERS/;";
$to_lower_case = eval "sub { $code }"
    or &my_die("Died: could not compile the lower-case filter - $@");


# Make sure the 'db' directory exists (-d is false for a missing path as well
# as for a non-directory, so a separate -e test is not needed).
unless ( -d "db" ) {
    mkdir("db",0777) or &my_die("Died: could not create directory 'db' - $!");
    print "Directory 'db' created\n";
}

# Open the two index files for binary writing. The three-argument form keeps
# the file name from being parsed as part of the open mode.
open(SITEWORDS, '>', $SITEWORDS) or &my_die("Died: could not open $SITEWORDS - $!");
binmode(SITEWORDS);
open(WORD_IND, '>', $WORD_IND) or &my_die("Died: could not open $WORD_IND - $!");
binmode(WORD_IND);


# Start of the run; $time1 is used again for the total-time report.
$time1 = time;
@time = localtime($time1);
$time = join(":", @time[2,1,0]);    # hour:min:sec, not zero-padded
print "Reading file: $time\n";


# Scan the database and collect, per word, the records it occurs in.
read_db();

$time2 = time;
@time = localtime($time2);
$time = join(":", @time[2,1,0]);
print "Scan finished: $time\n";
print "Creating databases. Please wait, this can take several minuts.\n";

# FILE is the handle read_db reads the database through; make sure it is
# released before the index files are written.
close(FILE);

# Write the word list (SITEWORDS) and the per-word record lists (WORD_IND).
#
#   SITEWORDS : every indexed word, sorted, one per line (LF terminated).
#   WORD_IND  : for every word, a 32-bit big-endian count followed by the
#               packed 4-byte entries accumulated in $words{$word}.
#
# While writing, $words{$word} is replaced by pack("NN", offset of the word in
# SITEWORDS, offset of its list in WORD_IND).
# Output is collected in two buffers and flushed in chunks; the offsets are the
# file position at the last flush plus the current buffer length.
print "Writing SITEWORDS\n";
$pos_sitewords = tell(SITEWORDS);
$pos_word_ind  = tell(WORD_IND);
$to_print_sitewords = "";
$to_print_word_ind  = "";
foreach $word (sort keys %words) {
    $cwn++;    # running count of words written
    $words_word_dum = pack("NN",
                           $pos_sitewords + length($to_print_sitewords),
                           $pos_word_ind  + length($to_print_word_ind));
    $to_print_sitewords .= "$word\x0A";
    $to_print_word_ind  .= pack("N", length($words{$word})/4) . $words{$word};
    $words{$word} = $words_word_dum;

    # Flush both buffers together so the base offsets stay in step.
    if (length($to_print_word_ind) > 32000) {
        print SITEWORDS $to_print_sitewords;
        print WORD_IND  $to_print_word_ind;
        $to_print_sitewords = "";
        $to_print_word_ind  = "";
        $pos_sitewords = tell(SITEWORDS);
        $pos_word_ind  = tell(WORD_IND);
    }
}
print SITEWORDS $to_print_sitewords;
print WORD_IND  $to_print_word_ind;

# Buffered write errors (e.g. disk full) only surface when the handle is
# closed, so a failed close means the index file is incomplete.
close(SITEWORDS) or &my_die("Died: could not write $SITEWORDS - $!");
close(WORD_IND)  or &my_die("Died: could not write $WORD_IND - $!");


# Build the hash index and report when it started and finished.
$time3 = time;
@time  = localtime($time3);
$time  = join(":", @time[2,1,0]);    # hour:min:sec, not zero-padded
print "Building HASH - " . $time . "\n";

build_hash();

$time4 = time;
@time  = localtime($time4);
$time  = join(":", @time[2,1,0]);
print "\nIndexing finished: " . $time . "\n";


# Elapsed wall-clock time of the whole run, broken down with gmtime so that
# no time-zone offset is added to the difference.
@time = gmtime($time4 - $time1);
$time = sprintf("%d:%d:%d", @time[2,1,0]);
print "Total time: " . $time . " sec.\n";


#=====================================================================

# Scan the database file $DB_NAME and fill the global %words.
#
# Records are delimited by $record_separator and split into fields on the
# pattern $field_separator; the fields whose indexes are listed in
# @index_fileds are concatenated and tokenised. For every word that is at
# least $min_length characters long and is not a key of %stop_words, the packed
# (pack "N") byte offset of the record is appended to $words{$word}. A word is
# recorded at most once per record. $rec_num counts the records read.
#
# Takes no arguments. Aborts through my_die if the file cannot be opened.
sub read_db {

    # Read one record per <FILE>; local restores $/ when the sub returns.
    local $/ = $record_separator;

    # Three-argument open: the name is never interpreted as a mode or a pipe.
    open(FILE, '<', $DB_NAME) or &my_die("Died: could not open file '$DB_NAME' - $!");

    # Packed start offset of the record currently being processed.
    my $pos = pack("N", 0);

    # Read into a lexical so the caller's $_ is left alone.
    while (defined(my $str = <FILE>)) {
        $rec_num++;
        print "Record - $rec_num\n" if ($rec_num % 100) == 0;

        my @fields = split /$field_separator/, $str;

        my $text = "";
        foreach my $field (@index_fileds) {
            $text .= " ".$fields[$field];
        }

        # Both filters modify the string in place through the reference.
        $remove_non_alphabetic->(\$text);
        $to_lower_case->(\$text);

        # Hyphenated words are indexed twice: as their separate parts (after
        # the hyphens are turned into spaces) and as the whole compound.
        my $wwd = join " ", ($text =~ m/([^- ]+-[^ ]+[^- ])/gs);
        $text =~ tr/-/ /;
        $text .= " ".$wwd;

        # Hash slice with no values: deduplicates the words of this record.
        my %seen = ();
        @seen{split (/\s+/,$text)} = ();
        foreach my $word (keys %seen) {
            next if length($word) < $min_length;
            next if exists($stop_words{$word});
            $words{$word} .= $pos;
        }

        # The next record starts where this one ended.
        $pos = pack("N", tell(FILE));
    }

    # Release the database file once it has been scanned.
    close(FILE);

} # sub read_db
#=====================================================================

# Build the hash index files $HASH and $HASHWORDS from the global %words.
#
# Every word is hashed (via hash()) on one or more substrings, depending on
# $INDEXING_SCHEME, and its %words value is appended to the bucket of each
# resulting hash value.
#
#   $HASH      : $HASHSIZE 32-bit big-endian entries; 0 for an empty bucket,
#                otherwise the byte offset of the bucket inside $HASHWORDS.
#   $HASHWORDS : 4 zero bytes, then for every non-empty bucket a 32-bit count
#                followed by the bucket's entries. The count is length/8,
#                i.e. it assumes each %words value is an 8-byte pack("NN")
#                pair.
#
# Takes no arguments. Aborts through my_die if a file cannot be opened or
# cannot be written completely.
sub build_hash {

    # One empty bucket per hash slot.
    @hash_array = ("") x $HASHSIZE;

    foreach my $word (sort keys %words) {
        # Number of start positions to hash: all 4-character windows under
        # scheme 3, only the start of the word otherwise. Under scheme 3 a
        # word shorter than 3 characters gets no window and is not hashed.
        my $subbound = ($INDEXING_SCHEME == 3) ? length($word) - 3 : 1;
        $subbound = 1 if length($word) == 3;

        # Scheme 1 hashes the whole word, the other schemes 4-character pieces.
        my $substring_length = ($INDEXING_SCHEME == 1) ? length($word) : 4;

        for my $start (0 .. $subbound - 1) {
            my $hash_value = &hash(substr($word, $start, $substring_length));
            $hash_array[$hash_value] .= $words{$word};
        }
    }

    # Three-argument open: the names are never interpreted as part of the mode.
    open(my $hash_fh, '>', $HASH)
        or &my_die("Died: could not open $HASH - $!");
    binmode($hash_fh);
    open(my $hashwords_fh, '>', $HASHWORDS)
        or &my_die("Died: could not open $HASHWORDS - $!");
    binmode($hashwords_fh);

    # Reserve the first 4 bytes of HASHWORDS, which keeps every real bucket
    # offset non-zero, since 0 marks an empty slot in HASH.
    my $zzz = pack("N", 0);
    print $hashwords_fh $zzz;

    # Output is buffered; a bucket's offset is the file position at the last
    # flush plus the amount currently buffered.
    my $pos_hashwords      = tell($hashwords_fh);
    my $to_print_hash      = "";
    my $to_print_hashwords = "";
    for my $slot (0 .. $HASHSIZE - 1) {
        if ($hash_array[$slot] eq "") {
            $to_print_hash .= $zzz;
        }
        else {
            $to_print_hash      .= pack("N", $pos_hashwords + length($to_print_hashwords));
            $to_print_hashwords .= pack("N", length($hash_array[$slot])/8) . $hash_array[$slot];
        }

        if (length($to_print_hashwords) > 64000) {
            print $hash_fh      $to_print_hash;
            print $hashwords_fh $to_print_hashwords;
            $to_print_hash      = "";
            $to_print_hashwords = "";
            $pos_hashwords      = tell($hashwords_fh);
        }
    }
    print $hash_fh      $to_print_hash;
    print $hashwords_fh $to_print_hashwords;

    # Buffered write errors only surface at close; do not report success for
    # a truncated index.
    close($hash_fh)      or &my_die("Died: could not write $HASH - $!");
    close($hashwords_fh) or &my_die("Died: could not write $HASHWORDS - $!");

} # sub build_hash
#=====================================================================

# Report a fatal error and abort the script.
#
#   $str - human-readable description of what went wrong.
#
# The message is printed to STDOUT, where the rest of the script's report
# goes, and is also used as the die message so that STDERR and any caller
# trapping the exception with eval see the reason instead of a bare "Died".
# The trailing newline keeps Perl from appending "at FILE line N".
sub my_die {
   my ($str) = @_;
   print "$str\n";
   die "$str\n";
}
#===================================================================

