koha-cvs
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Koha-cvs] CVS: koha/C4 SearchMarcTest.pm,NONE,1.1


From: Paul POULAIN
Subject: [Koha-cvs] CVS: koha/C4 SearchMarcTest.pm,NONE,1.1
Date: Fri, 27 May 2005 02:30:26 -0700

Update of /cvsroot/koha/koha/C4
In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv1631/C4

Added Files:
        SearchMarcTest.pm 
Log Message:
proof of concept for inverted index tables for search

how it works :
* create the table marc_Tword with the following structure :
CREATE TABLE `marc_Tword` (
  `word` varchar(80) NOT NULL default '',
  `usedin` text NOT NULL,
  `tagsubfield` varchar(4) NOT NULL default '',
  PRIMARY KEY  (`word`,`tagsubfield`)
) TYPE=MyISAM;
* open a console & type export PERL5LIB & export KOHA_CONF as usual.
* fill this table with misc/build_marc_Tword.pl. Warning: this script uses a 
very memory-consuming but very fast method to fill the table: it does 
everything in memory, then writes everything. Another method is provided 
(commented out), but it is 100 times slower (really!)
* open opac-search.pl and replace "use C4::SearchMarc;" by "use 
C4::SearchMarcTest;". As the API hasn't changed, it will work immediately.
* go to opac-search (advanced search) & search whatever you want. Should work 
fine.

LIMITS :
* build_marc_Tword has problems with extended chars (accented ones mainly). So 
don't be afraid if you get SQL errors. They are not a problem for a POC.
* search results are always ordered by title, whatever you choose.
* search only supports "WORDA and WORDB"; "WORDA or WORDB" and "WORDA 
except WORDB" are not supported yet.


--- NEW FILE ---
package C4::SearchMarcTest;

# Copyright 2000-2002 Katipo Communications
#
# This file is part of Koha.
#
# Koha is free software; you can redistribute it and/or modify it under the
# terms of the GNU General Public License as published by the Free Software
# Foundation; either version 2 of the License, or (at your option) any later
# version.
#
# Koha is distributed in the hope that it will be useful, but WITHOUT ANY
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
# A PARTICULAR PURPOSE.  See the GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along with
# Koha; if not, write to the Free Software Foundation, Inc., 59 Temple Place,
# Suite 330, Boston, MA  02111-1307 USA

use strict;
require Exporter;
use DBI;
use C4::Context;
use C4::Biblio;
use C4::Date;
use Date::Manip;

# Package globals used by Exporter and by version checking. Declared with
# "our" (the "use vars" pragma is obsolete).
our ($VERSION, @ISA, @EXPORT, @EXPORT_OK, %EXPORT_TAGS);

# set the version for version checking
$VERSION = 0.02;

=head1 NAME

C4::SearchMarcTest - Functions for searching the Koha MARC catalog (inverted index proof of concept)

=head1 FUNCTIONS

This module provides the searching facilities for the Koha MARC catalog.

It is a COPY of the official SearchMarc, with some tests for an inverted
index table. It works only with 1 MARC tag/subfield.

=cut

@ISA = qw(Exporter);
@EXPORT = qw(&catalogsearch &findseealso &findsuggestion &getMARCnotes 
&getMARCsubjects);

=head1 $suggestions = findsuggestion($dbh,$values);

=head2 $dbh is a link to the DB handler.

use C4::Context;
my $dbh =C4::Context->dbh;

=head2 $values is a reference to an array of words

Searches words with the same soundex, ordered by frequency of use.
Useful to suggest other searches to the users.

Words shorter than 5 characters are ignored, and at most 10 suggestions are
kept for each word.

=head2 RETURNS

a reference to an array of strings formatted as "searched word|suggested
word|number of uses".

=cut

sub findsuggestion {
    my ($dbh, $values) = @_;
    my $sth = $dbh->prepare(
        'SELECT count( * ) AS total, word FROM marc_word '
      . 'WHERE sndx_word = soundex( ? ) AND word <> ? '
      . 'GROUP BY word ORDER BY total DESC'
    );
    my @results;
    foreach my $searched (@$values) {
        # Only words of 5 characters or more are worth a suggestion lookup.
        next unless defined $searched && length($searched) >= 5;
        $sth->execute($searched, $searched);
        my $kept = 0;
        while (my ($count, $word) = $sth->fetchrow) {
            push @results, "$searched|$word|$count";
            # Rows arrive most-used first; keep only the 10 best per word.
            last if ++$kept >= 10;
        }
    }
    return \@results;
}

=head1 findseealso($dbh,$fields);

=head2 $dbh is a link to the DB handler.

use C4::Context;
my $dbh =C4::Context->dbh;

=head2 $fields is a reference to the fields array

This function modifies the @$fields array in place: for every entry whose
tag/subfield has a "seealso" value in the MARC tags library, that value is
appended to the entry after a comma, so the related fields are searched too.

=cut

sub findseealso {
    my ($dbh, $fields) = @_;
    my $tagslib = MARCgettagslib($dbh, 1);
    # foreach aliases each element, so appending to $field updates @$fields.
    foreach my $field (@$fields) {
        # The tag starts at offset 1 and the subfield code is at offset 4
        # (presumably because entries are quoted, like '200a' -- verify
        # against the caller).
        my $tag      = substr($field, 1, 3);
        my $subfield = substr($field, 4, 1);
        my $related  = $tagslib->{$tag}->{$subfield}->{seealso};
        $field .= ',' . $related if $related;
    }
    return;
}

=head1 my ($results, $count) = catalogsearch($dbh, $tags, $and_or, $excluding, $operator, $value, $offset, $length, $orderby, $desc_or_asc);

=head2 $dbh is a link to the DB handler.

use C4::Context;
my $dbh =C4::Context->dbh;

$tags,$and_or, $excluding, $operator, $value are references to array

=head2 $tags

contains the list of tags+subfields (for example : $tags->[0] = "'200a'")
A field can be a list of fields : '200f','700a','700b','701a','701b'

Each entry is inserted as is into an SQL "in (...)" list, so it must already
be quoted.

=head2 $and_or

contains  a list of strings containing and or or. The 1st value is useless.
Not used by this proof of concept : every word is combined with AND.

=head2 $excluding

contains 0 or 1. If 1, then the request is negated.
Not used by this proof of concept.

=head2 $operator

contains contains,=,start,>,>=,<,<= the = and start work on the complete 
subfield. The contains operator works on every word in the subfield.

examples :
contains home, search home anywhere.
= home, search a string being home.

In this proof of concept every operator is handled as a word search in the
marc_Tword table; the operator only decides whether punctuation is removed
from the value (it is removed for contains).

=head2 $value

contains the value to search
If it contains a * or a %, then the search is partial.
The caller's array is not modified.

=head2 $offset and $length

returns $length results, beginning at $offset

=head2 $orderby and $desc_or_asc

define the field used to order the request. Any field in the biblio/biblioitem 
tables can be used. DESC is possible too

(for example title, title DESC,...)

Not used by this proof of concept : the marc_Tword entries are sorted as
strings.

=head2 RETURNS

returns a reference to an array containing hashes, then the total number of
matching biblios. The hash contains all biblio & biblioitems fields and, in
the CN key, a reference to an array of item hashes (holdingbranch,
itemcallnumber, location, date_due, notforloan), one for each item that is
not lost.

=cut

=head2 my $marcnotesarray = &getMARCnotes($dbh,$bibid,$marcflavour);

Returns a reference to an array containing all the notes stored in the MARC 
database for the given bibid.
$marcflavour ("MARC21" or "UNIMARC") determines which tags are used for 
retrieving notes.

=head2 my $marcsubjctsarray = &getMARCsubjects($dbh,$bibid,$marcflavour);

Returns a reference to an array containing all the subjects stored in the MARC 
database for the given bibid.
$marcflavour ("MARC21" or "UNIMARC") determines which tags are used for 
retrieving subjects.

=cut

sub catalogsearch {
    my ($dbh, $tags, $and_or, $excluding, $operator, $value,
        $offset, $length, $orderby, $desc_or_asc) = @_;

    # Missing paging arguments count as 0 so the arithmetic below is defined.
    $offset = 0 unless $offset;
    $length = 0 unless $length;

    # the item.notforloan contains an integer. Every value <>0 means "book
    # unavailable for loan", but each library can have its own table of
    # meaning for each value. Get them : 1st search if there is a list of
    # authorised values connected to items.notforloan
    my %notforloanstatus;
    my $sth = $dbh->prepare('select authorised_value from marc_subfield_structure where kohafield="items.notforloan"');
    $sth->execute;
    my ($authorised_valuecode) = $sth->fetchrow;
    if ($authorised_valuecode) {
        $sth = $dbh->prepare("select authorised_value,lib from authorised_values where category=?");
        $sth->execute($authorised_valuecode);
        while (my ($authorised_value, $lib) = $sth->fetchrow) {
            $notforloanstatus{$authorised_value} = $lib ? $lib : $authorised_value;
        }
    }

    #
    # marc_Tword PROOF OF CONCEPT BEGINNING
    #
    # fixme : only do a search on "contains every word"
    # misses :
    # - begins or is equal to
    # - excluding
    # - or

    # the global array result : marc_Tword entries matching every word so far.
    my @result;
    # Explicit flag : an empty @result can also mean "the intersection is
    # already empty", which must not be refilled by the next word.
    my $first_word = 1;
    for my $i (0 .. $#{$value}) {
        # Work on copies so the caller's arrays are left untouched.
        my $terms = defined $value->[$i]    ? $value->[$i]    : '';
        my $op    = defined $operator->[$i] ? $operator->[$i] : '';
        # replace * by %
        $terms =~ s/\*/%/g;
        # remove % at the beginning
        $terms =~ s/^%//;
        $terms =~ s/[.?:!',\-"()\[\]{}\/]/ /g if $op eq "contains";

        # NOTE(review): $tags->[$i] is interpolated into the SQL text. It is
        # expected to be a quoted list such as '200a','200e'; if it can come
        # from user input it must be validated by the caller.
        my $Tquery;
        # One request per word. Empty words (several spaces in a row, for
        # instance after punctuation removal) are skipped : they would match
        # nothing and wipe out the whole result.
        foreach my $word (grep { length } split /\s+/, $terms) {
            $Tquery ||= $dbh->prepare("select tagsubfield,usedin from marc_Tword where tagsubfield in ($tags->[$i]) and word like ?");
            $Tquery->execute($word);
            warn "EXECUTING select tagsubfield,usedin from marc_Tword where tagsubfield in ($tags->[$i]) and word like $word";
            # Entries for this word, de-duplicated : the same entry can be
            # listed under several tagsubfields or several matching words.
            my %thiswordresults;
            while (my ($tagsubfield, $usedin) = $Tquery->fetchrow) {
                # usedin is a comma separated list that begins with a comma,
                # so the 1st entry is empty and must be skipped.
                foreach my $entry (split /,/, $usedin) {
                    $thiswordresults{$entry} = 1 if $entry;
                }
            }
            # as it's an AND, keep only what is in both lists
            if ($first_word) {
                @result = keys %thiswordresults;
                $first_word = 0;
            }
            else {
                @result = grep { $thiswordresults{$_} } @result;
            }
        }
    }

    # we have the entries array. now, sort it (string sort)
    @result = sort @result;

    # Now, loads title and author from [offset] to [offset]+[length]
    # HINT : biblionumber as bn is important. The hash fills biblionumber
    # with items.biblionumber, so if there is no item, you get a not nice
    # empty value.
    my $sth_biblio = $dbh->prepare(
        "SELECT biblio.biblionumber as bn,biblioitems.*,biblio.*, marc_biblio.bibid,itemtypes.notforloan,itemtypes.description
            FROM biblio, marc_biblio
            LEFT JOIN biblioitems on biblio.biblionumber = biblioitems.biblionumber
            LEFT JOIN itemtypes on itemtypes.itemtype=biblioitems.itemtype
            WHERE biblio.biblionumber = marc_biblio.biblionumber AND biblio.biblionumber = ?"
    );
    #
    # marc_Tword Proof of concept END
    #
    my $sth_subtitle = $dbh->prepare("SELECT subtitle FROM bibliosubtitle WHERE biblionumber=?"); # Added BY JF for Subtitles
    my $sth_itemCN = $dbh->prepare("select items.* from items where biblionumber=? and (itemlost = 0 or itemlost is NULL)");
    my $sth_issue = $dbh->prepare("select date_due,returndate from issues where itemnumber=?");
    my $sth_onorder;    # prepared on first use only
    my @finalresult = ();

    # parse all biblios between start & end : exactly $length entries.
    warn "RESULT SIZE : ".$#result;
    my $last_wanted = $offset + $length - 1;
    $last_wanted = $#result if $last_wanted > $#result;
    foreach my $counter ($offset .. $last_wanted) {
        # 1st, get the biblionumber : it is what follows the last "-" of the
        # entry. Captured in list context so a failed match gives undef
        # instead of a stale value from a previous match.
        my ($wanted_biblio) = $result[$counter] =~ /.*-(.*)/;
        next unless defined $wanted_biblio;
        $sth_biblio->execute($wanted_biblio);
        warn "EXECUTING SELECT biblio.biblionumber as bn,biblioitems.*,biblio.*, marc_biblio.bibid,itemtypes.notforloan,itemtypes.description FROM biblio, marc_biblio LEFT JOIN biblioitems on biblio.biblionumber = biblioitems.biblionumber LEFT JOIN itemtypes on itemtypes.itemtype=biblioitems.itemtype WHERE biblio.biblionumber = marc_biblio.biblionumber AND biblio.biblionumber = $wanted_biblio";
        my $line = $sth_biblio->fetchrow_hashref;
        # The index can point to a biblio that is gone : skip it.
        next unless $line;
        my $biblionumber = $line->{bn};

        # Return subtitles first ADDED BY JF
        $sth_subtitle->execute($biblionumber);
        warn "EXECUTING SELECT subtitle FROM bibliosubtitle WHERE biblionumber=$biblionumber";
        my ($subtitle) = $sth_subtitle->fetchrow;
        $subtitle = '' unless defined $subtitle;
        # /ADDED BY JF

        # search & parse all items & note itemcallnumber
        $sth_itemCN->execute($biblionumber);
        warn "EXECUTING itemCN select items.* from items where biblionumber=$biblionumber and (itemlost = 0 or itemlost is NULL)";
        my @CNresults;
        my $totalitems = 0;
        my $notforloan = 1; # to see if there is at least 1 item that can be issued
        while (my $item = $sth_itemCN->fetchrow_hashref) {
            # search if item is on loan
            my $date_due;
            $sth_issue->execute($item->{itemnumber});
            warn "EXECUTING ISSUES select date_due,returndate from issues where itemnumber=".$item->{itemnumber};
            while (my $loan = $sth_issue->fetchrow_hashref) {
                if ($loan->{date_due} and !$loan->{returndate}) {
                    $date_due = $loan->{date_due};
                }
            }
            # store this item
            my %lineCN;
            $lineCN{holdingbranch}  = $item->{holdingbranch};
            $lineCN{itemcallnumber} = $item->{itemcallnumber};
            $lineCN{location}       = $item->{location};
            $lineCN{date_due}       = format_date($date_due);
            # setting notforloan if itemtype is not for loan
            $lineCN{notforloan} = $notforloanstatus{$line->{notforloan}} if ($line->{notforloan});
            # setting notforloan if this item is not for loan
            $lineCN{notforloan} = $notforloanstatus{$item->{notforloan}} if ($item->{notforloan});
            $notforloan = 0 unless ($item->{notforloan} or $item->{wthdrawn} or $item->{itemlost});
            push @CNresults, \%lineCN;
            $totalitems++;
        }

        # save the biblio in the final array, with item and item issue status
        my %newline = %$line;
        $newline{totitem} = $totalitems;
        # if $totalitems == 0, check if it's being ordered.
        if ($totalitems == 0) {
            $sth_onorder ||= $dbh->prepare("select count(*) from aqorders where biblionumber=? and datecancellationprinted is NULL");
            $sth_onorder->execute($biblionumber);
            warn "EXECUTING select count(*) from aqorders where biblionumber=$biblionumber and datecancellationprinted is NULL";
            my ($ordered) = $sth_onorder->fetchrow;
            $newline{onorder} = 1 if $ordered;
        }
        $newline{biblionumber} = $biblionumber;
        $newline{norequests} = 0;
        $newline{norequests} = 1 if ($line->{notforloan}); # itemtype not issuable
        $newline{norequests} = 1 if (!$line->{notforloan} && $notforloan); # itemtype issuable but all items not issuable for instance
        $newline{subtitle} = $subtitle;  # put the subtitle in ADDED BY JF

        # @CNresults is a fresh array on every pass, so its reference can be
        # stored directly.
        $newline{CN} = \@CNresults;
        $newline{'even'} = 1 if $#finalresult % 2 == 0;
        $newline{'odd'} = 1 if $#finalresult % 2 == 1;
        $newline{'timestamp'} = format_date($newline{timestamp});
        push @finalresult, \%newline;
    }
    my $nbresults = $#result+1;
    return (\@finalresult, $nbresults);
}

# Creates the SQL Request
#
# Builds the pieces of a search request on the MARC tables from parallel
# arrays (one entry per search term) :
#   $dbh      : DB handler, only used to quote the values
#   $tags     : quoted tag+subfield lists, inserted as is in "in (...)"
#   $and_or   : "and" / "or" joining each term to the previous ones
#   $operator : "start", "contains" or an SQL comparison operator
#   $value    : the searched values; empty entries are skipped
# At most 8 terms are used, to avoid a too complex search.
#
# Returns ($sql_tables, $sql_where1, $sql_where2) :
#   $sql_tables : "marc_subfield_table as m1,marc_word as m2,..."
#   $sql_where1 : the "true" where, one parenthesised condition per term
#   $sql_where2 : "(m1.bibid=m2.bibid and ...)" or "" when nothing was added
#
# NOTE(review): $tags, $and_or and $operator entries are interpolated into
# the SQL text without quoting; callers must only pass trusted values.
sub create_request {
    my ($dbh, $tags, $and_or, $operator, $value) = @_;

    my $sql_tables = '';    # will contain marc_subfield_table as m1,...
    my $sql_where1 = '';    # will contain the "true" where
    my $sql_where2 = "(";   # will contain m1.bibid=m2.bibid
    my $nb_active = 0;      # number of "active" entries. an entry is active if a value is provided.
    my $nb_table = 1;       # number of tables. ++ on each entry EXCEPT when an OR is provided.

    my $maxloop = 8;        # the maximum number of words to avoid a too complex search.
    $maxloop = @$value if @$value < $maxloop;

    # Indexes 0 .. $maxloop-1 : exactly $maxloop entries, never a ninth one.
    for my $i (0 .. $maxloop - 1) {
        my $term = $value->[$i];
        next unless $term;
        $nb_active++;
        my $op      = defined $operator->[$i] ? $operator->[$i] : '';
        my $taglist = $tags->[$i];

        if ($nb_active == 1) {
            # 1st active entry : always table m1, no and/or prefix.
            if ($op eq "start") {
                $sql_tables .= "marc_subfield_table as m$nb_table,";
                $sql_where1 .= "(m1.subfieldvalue like ".$dbh->quote("$term%");
                $sql_where1 .= " and concat(m1.tag,m1.subfieldcode) in ($taglist)" if $taglist;
                $sql_where1 .= ")";
            }
            elsif ($op eq "contains") {
                $sql_tables .= "marc_word as m$nb_table,";
                $sql_where1 .= "(m1.word  like ".$dbh->quote("$term");
                $sql_where1 .= " and m1.tagsubfield in ($taglist)" if $taglist;
                $sql_where1 .= ")";
            }
            else {
                $sql_tables .= "marc_subfield_table as m$nb_table,";
                $sql_where1 .= "(m1.subfieldvalue $op ".$dbh->quote("$term");
                $sql_where1 .= " and concat(m1.tag,m1.subfieldcode) in ($taglist)" if $taglist;
                $sql_where1 .= ")";
            }
            next;
        }

        # Following entries : prefixed by and/or and joined to m1 on bibid.
        my $joiner = defined $and_or->[$i] ? $and_or->[$i] : '';
        if ($op eq "start") {
            $nb_table++;
            $sql_tables .= "marc_subfield_table as m$nb_table,";
            $sql_where1 .= "$joiner (m$nb_table.subfieldvalue like ".$dbh->quote("$term%");
            $sql_where1 .= " and concat(m$nb_table.tag,m$nb_table.subfieldcode) in ($taglist)" if $taglist;
            $sql_where1 .= ")";
        }
        elsif ($op eq "contains") {
            if ($joiner eq 'and') {
                $nb_table++;
                $sql_tables .= "marc_word as m$nb_table,";
                $sql_where1 .= "$joiner (m$nb_table.word like ".$dbh->quote("$term");
                $sql_where1 .= " and m$nb_table.tagsubfield in($taglist)" if $taglist;
            }
            else {
                # OR : no new table, the condition reuses the current one.
                # NOTE(review): this assumes the current table is marc_word;
                # after a "start" entry it is marc_subfield_table -- confirm.
                $sql_where1 .= "$joiner (m$nb_table.word like ".$dbh->quote("$term");
                $sql_where1 .= "  and m$nb_table.tagsubfield in ($taglist)" if $taglist;
            }
            $sql_where1 .= ")";
        }
        else {
            $nb_table++;
            $sql_tables .= "marc_subfield_table as m$nb_table,";
            $sql_where1 .= "$joiner (m$nb_table.subfieldvalue $op ".$dbh->quote($term);
            $sql_where1 .= "  and concat(m$nb_table.tag,m$nb_table.subfieldcode) in ($taglist)" if $taglist;
            $sql_where1 .= ")";
        }
        $sql_where2 .= "m1.bibid=m$nb_table.bibid and ";
    }

    if ($sql_where2 ne "(") {
        # some datas added to sql_where2 : delete the trailing ' and '
        $sql_where2 =~ s/ and \z//;
        $sql_where2 .= ")";
    }
    else {
        # no sql_where2 statement, deleting '('
        $sql_where2 = "";
    }
    $sql_tables =~ s/,\z//;    # deletes the trailing ','
    return ($sql_tables, $sql_where1, $sql_where2);
}

# Returns a reference to an array of { marcnote => $text } hashes built from
# the note subfields stored in marc_subfield_table for $bibid.
#   $dbh         : DB handler. It belongs to the caller and is left open.
#   $bibid       : the MARC record identifier
#   $marcflavour : "MARC21" uses tags 500-599, anything else is treated as
#                  UNIMARC and uses tags 300-399
# Consecutive subfields with the same tag are joined with a space into one
# note; a new note starts each time the tag changes.
sub getMARCnotes {
    my ($dbh, $bibid, $marcflavour) = @_;
    my ($mintag, $maxtag);
    if ($marcflavour eq "MARC21") {
        $mintag = "500";
        $maxtag = "599";
    } else {           # assume unimarc if not marc21
        $mintag = "300";
        $maxtag = "399";
    }

    my $sth = $dbh->prepare("SELECT subfieldvalue,tag FROM marc_subfield_table WHERE bibid=? AND tag BETWEEN ? AND ? ORDER BY tagorder");
    $sth->execute($bibid, $mintag, $maxtag);

    my @marcnotes;
    my $note = "";
    my $tag = "";

    while (my $data = $sth->fetchrow_arrayref) {
        my ($value, $thistag) = @$data[0, 1];
        # Two spaces after a final full stop, so joined subfields read well.
        if ($value =~ /\.$/) {
            $value = $value . "  ";
        }
        if ($thistag ne $tag) {
            # New tag : store the note built so far and start a new one.
            # $tag is updated on the very first row too, so the first tag's
            # subfields stay together.
            push @marcnotes, { marcnote => $note } if $note ne "";
            $note = $value;
            $tag = $thistag;
        }
        elsif ($note ne $value) {
            $note = $note." ".$value;
        }
    }

    if ($note) {
        push @marcnotes, { marcnote => $note };   #load last tag into array
    }

    $sth->finish;
    # No $dbh->disconnect here : closing the caller's handle would break
    # every query the caller runs afterwards.

    return \@marcnotes;
}  # end getMARCnotes


# Returns a reference to an array of { MARCSUBJCT => $text } hashes built
# from the subject subfields stored in marc_subfield_table for $bibid.
#   $dbh         : DB handler. It belongs to the caller and is left open.
#   $bibid       : the MARC record identifier
#   $marcflavour : "MARC21" uses tags 600-699, anything else is treated as
#                  UNIMARC and uses tags 600-619
# Only subfield "a" is kept, and a value equal to the previously kept one is
# skipped.
sub getMARCsubjects {
    my ($dbh, $bibid, $marcflavour) = @_;
    my ($mintag, $maxtag);
    if ($marcflavour eq "MARC21") {
        $mintag = "600";
        $maxtag = "699";
    } else {           # assume unimarc if not marc21
        $mintag = "600";
        $maxtag = "619";
    }
    my $sth = $dbh->prepare("SELECT subfieldvalue,subfieldcode FROM marc_subfield_table WHERE bibid=? AND tag BETWEEN ? AND ? ORDER BY tagorder");
    $sth->execute($bibid, $mintag, $maxtag);

    my @marcsubjcts;
    my $subjct = "";    # last subject kept, used to skip repeats

    while (my $data = $sth->fetchrow_arrayref) {
        my ($value, $subfield) = @$data[0, 1];
        if ($subfield eq "a" && $value ne $subjct) {
            push @marcsubjcts, { MARCSUBJCT => $value };
            $subjct = $value;
        }
    }

    $sth->finish;
    # No $dbh->disconnect here : closing the caller's handle would break
    # every query the caller runs afterwards.

    return \@marcsubjcts;
}  #end getMARCsubjects

# Global destructor hook : module clean-up code would go here. Nothing to
# clean up for now.
END {
}

# A module file must end with a true value.
1;
__END__

=head1 AUTHOR

Koha Development team <address@hidden>

=cut




reply via email to

[Prev in Thread] Current Thread [Next in Thread]