[Koha-cvs] koha/misc/migration_tools bulkmarcimport.pl


From: Thomas D
Subject: [Koha-cvs] koha/misc/migration_tools bulkmarcimport.pl
Date: Fri, 01 Sep 2006 17:11:53 +0000

CVSROOT:        /sources/koha
Module name:    koha
Changes by:     Thomas D <thd>  06/09/01 17:11:53

Modified files:
        misc/migration_tools: bulkmarcimport.pl 

Log message:
        For MARC 21, instead of deleting the whole subfield when a character does not
        translate properly from MARC8 into UTF-8, only the problem characters are
        deleted.
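
For context, a minimal standalone sketch (not part of the patch) of the MARC::Charset
behaviour the log message describes; the sample subfield string and the stray byte in
it are invented for illustration:

    #!/usr/bin/perl
    # Sketch only: shows how ignore_errors(1) changes marc8_to_utf8().
    use strict;
    use warnings;
    use MARC::Charset qw(marc8_to_utf8);

    # With the default settings, a character that does not translate from MARC-8
    # makes the conversion give up, so the caller ends up losing the whole
    # subfield.  With ignore_errors(1) only the problem character is dropped and
    # the rest of the subfield comes back converted.
    MARC::Charset->ignore_errors(1);

    my $marc8_subfield = "A title with a stray byte \x{80} in it";   # invented sample
    my $utf8_subfield  = marc8_to_utf8($marc8_subfield);

    print defined $utf8_subfield ? "converted: $utf8_subfield\n"
                                 : "conversion returned nothing\n";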

CVSWeb URLs:
http://cvs.savannah.gnu.org/viewcvs/koha/misc/migration_tools/bulkmarcimport.pl?cvsroot=koha&r1=1.7&r2=1.8

Patches:
Index: bulkmarcimport.pl
===================================================================
RCS file: /sources/koha/koha/misc/migration_tools/bulkmarcimport.pl,v
retrieving revision 1.7
retrieving revision 1.8
diff -u -b -r1.7 -r1.8
--- bulkmarcimport.pl   4 Jul 2006 15:06:35 -0000       1.7
+++ bulkmarcimport.pl   1 Sep 2006 17:11:53 -0000       1.8
@@ -12,25 +12,33 @@
 use MARC::Record;
 use MARC::Batch;
 use MARC::Charset;
+
+# According to kados, an undocumented feature of setting MARC::Charset to 
+# ignore_errors(1) is that errors are not ignored.  Instead of deleting the 
+# whole subfield when a character does not translate properly from MARC8 into 
+# UTF-8, just the problem characters are deleted.  This should solve at least 
+# some of the fixme problems for fMARC8ToUTF8().
+# 
+# Problems remain if there are MARC 21 records where 000/09 is set incorrectly.
+# -- thd.
+MARC::Charset->ignore_errors(1);
+
 use C4::Context;
 use C4::Biblio;
 use Time::HiRes qw(gettimeofday);
 use Getopt::Long;
 binmode(STDOUT, ":utf8");
 
-use Getopt::Long;
-
 my ( $input_marc_file, $number) = ('',0);
-my ($version, $delete, $test_parameter,$char_encoding, $verbose, $commit);
+my ($version, $delete, $test_parameter,$marcFlavour, $verbose);
 
 GetOptions(
-       'commit:f'      => \$commit,
     'file:s'    => \$input_marc_file,
-    'n:f' => \$number,
+       'n' => \$number,
     'h' => \$version,
     'd' => \$delete,
     't' => \$test_parameter,
-    'c:s' => \$char_encoding,
+       'c:s' => \$marcFlavour,
     'v:s' => \$verbose,
 );
 
@@ -145,8 +153,7 @@
 \th : this version/help screen
 \tfile /path/to/file/to/dump : the file to dump
 \tv : verbose mode. 1 means "some infos", 2 means "MARC dumping"
-\tn : the number of records to import. If missing, all the file is imported
-\tcommit : the number of records to wait before performing a 'commit' operation
+\tn : the number of the record to import. If missing, all the file is imported
 \tt : test mode : parses the file, saying what he would do, but doing nothing.
 \tc : the characteristic MARC flavour. At the moment, only MARC21 and UNIMARC 
 \tsupported. MARC21 by default.
@@ -154,12 +161,10 @@
 \t\tbiblio, \t\tbiblioitems, \t\tsubjects,\titems
 \t\tadditionalauthors, \tbibliosubtitles, \tmarc_biblio,
 \t\tmarc_subfield_table, \tmarc_word, \t\tmarc_blob_subfield
-IMPORTANT : don't use this script before you've entered and checked your MARC parameters tables twice (or more!).
-Otherwise, the import won't work correctly and you will get invalid data.
+IMPORTANT : don't use this script before you've entered and checked twice (or more) your  MARC parameters tables.
+If you fail this, the import won't work correctly and you will get invalid datas.
 
-SAMPLE : 
-\t\$ export KOHA_CONF=/etc/koha.conf
-\t\$ perl misc/migration_tools/bulkmarcimport.pl -d -commit 1000 -file /home/jmf/koha.mrc -n 3000
+SAMPLE : ./bulkmarcimport.pl -file /home/paul/koha.dev/local/npl -n 1
 EOF
 ;#'
 die;
@@ -191,35 +196,50 @@
 $batch->warnings_off();
 $batch->strict_off();
 my $i=0;
-my $commitnum = 50;
-
-if ($commit) {
-
-$commitnum = $commit;
-
-}
-
 #1st of all, find item MARC tag.
 my ($tagfield,$tagsubfield) = &MARCfind_marc_from_kohafield($dbh,"items.itemnumber",'');
 # $dbh->do("lock tables biblio write, biblioitems write, items write, marc_biblio write, marc_subfield_table write, marc_blob_subfield write, marc_word write, marc_subfield_structure write, stopwords write");
 while ( my $record = $batch->next() ) {
-warn "I:".$i;
-warn "NUM:".$number;
        $i++;
-
-       if ($i==$number) {
-               z3950_extended_services('commit',set_service_options('commit'));
-               print "COMMIT OPERATION SUCCESSFUL\n";
-
-               my $timeneeded = gettimeofday - $starttime;
-               die "$i MARC records imported in $timeneeded seconds\n";
-       }
-       # perform the commit operation ever so often
-       if ($i==$commit) {
-               z3950_extended_services('commit',set_service_options('commit'));
-               $commit+=$commitnum;
-               print "COMMIT OPERATION SUCCESSFUL\n";
+#FIXME: it's kind of silly to go from MARC::Record to MARC::File::XML and 
+       # then back again just to fix the encoding
+       #
+       # It is even sillier when the conversion too frequently produces errors 
+       # instead of fixing the encoding.  Hence, the following MARC::File::XML 
+       # lines are now commented out until character set conversion in XML 
+       # works better. -- thd
+       ## my $uxml = $record->as_xml;
+       ## $record = MARC::Record::new_from_xml($uxml, 'UTF-8');
+       
+       # Check record encoding and convert encoding if necessary.
+       
+       if ($marcFlavour eq 'MARC21') {
+               my $tag000_pos09;
+               if ($record->encoding() eq 'UTF-8') {
+                       if ($verbose) {
+                               print "\nRecord $i encoding is UTF-8\n";
+                               $tag000_pos09 = substr ($record->leader, 9, 1);
+                               $tag000_pos09 =~ s/ /#/;
+                               print "\nUTF-8 LEADER/09: " . $tag000_pos09 ."\n";
+                       }
+               } elsif ($record->encoding() eq 'MARC-8') {
+                       print "\nConverting record $i encoding from MARC8 to UTF-8\n";
+                       # Convert MARC-8 to UTF-8
+                       $record = fMARC8ToUTF8($record, $verbose);
+                       if ($verbose) {
+                               print "\nRecord $i encoding has been converted to UTF-8\n";
+                               $tag000_pos09 = substr ($record->leader, 9, 1);
+                               $tag000_pos09 =~ s/ /#/;
+                               print "\nUTF-8 LEADER/09: " . $tag000_pos09 ."\n";
+                       }
+               }
+       } elsif ($marcFlavour eq 'UNIMARC') {
+               # I have not developed a UNIMARC character encoding conversion script
+               # yet.  Common encodings should be easy.  Less comon and multiple
+               # encodings will need extra work.  I am happy to work on this if there
+               # is some interest. -- thd
        }
+       
        #now, parse the record, extract the item fields, and store them in somewhere else.
 
        ## create an empty record object to populate
@@ -245,9 +265,9 @@
 
        # go through each subfield code/data pair
        foreach my $pair ( $oldField->subfields() ) { 
-               #$pair->[1] =~ s/\<//g;
-               #$pair->[1] =~ s/\>//g;
-               push( @newSubfields, $pair->[0], $pair->[1] ); #char_decode($pair->[1],$char_encoding) );
+               $pair->[1] =~ s/\<//g;
+               $pair->[1] =~ s/\>//g;
+               push( @newSubfields, $pair->[0], char_decode($pair->[1],$marcFlavour) );
        }
 
        # add the new field to our new record
@@ -262,7 +282,10 @@
 
        }
 
+
+       if ($verbose) {
        warn "$i ==>".$newRecord->as_formatted() if $verbose eq 2;
+       }
        my @fields = $newRecord->field($tagfield);
        my @items;
        my $nbitems=0;
@@ -277,17 +300,13 @@
        print "$i : $nbitems items found\n" if $verbose;
        # now, create biblio and items with NEWnewXX call.
        unless ($test_parameter) {
-               my ($bibid,$oldbibitemnum) = NEWnewbiblio($dbh,$newRecord,'');
+               my ($bibid,$oldbibnum,$oldbibitemnum) = NEWnewbiblio($dbh,$newRecord,'');
                warn "ADDED biblio NB $bibid in DB\n" if $verbose;
                for (my $i=0;$i<=$#items;$i++) {
-                   warn "here is the biblioitemnumber $oldbibitemnum";
-                       NEWnewitem($dbh,$items[$i],$bibid,$oldbibitemnum);
+                       NEWnewitem($dbh,$items[$i],$bibid);
                }
        }
 }
-# final commit of the changes
-z3950_extended_services('commit',set_service_options('commit'));
-print "COMMIT OPERATION SUCCESSFUL\n";
-
+# $dbh->do("unlock tables");
 my $timeneeded = gettimeofday - $starttime;
-print "$i MARC records done in $timeneeded seconds\n";
+print "$i MARC record done in $timeneeded seconds";
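
For anyone trying the updated script, a hedged invocation sketch along the lines of
the old SAMPLE text (paths and file names are placeholders; adjust to your install):

    $ export KOHA_CONF=/etc/koha.conf
    $ perl misc/migration_tools/bulkmarcimport.pl -file /path/to/records.mrc -c MARC21 -v 1

Per the help text, -c selects the MARC flavour (MARC21 or UNIMARC, MARC21 by
default); for MARC21 input, MARC-8 records are now converted to UTF-8 as they are
imported.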