koha-cvs
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Koha-cvs] koha/misc/migration_tools bulkmarcimport.pl [rel_2_2]


From: Thomas D
Subject: [Koha-cvs] koha/misc/migration_tools bulkmarcimport.pl [rel_2_2]
Date: Wed, 26 Apr 2006 07:36:12 +0000

CVSROOT:        /sources/koha
Module name:    koha
Branch:         rel_2_2
Changes by:     Thomas D <address@hidden>       06/04/26 07:36:12

Modified files:
        misc/migration_tools: bulkmarcimport.pl 

Log message:
        MARC8 to UTF-8 support added without XML problems.  Rationalised the 
variable
        name used for designating MARC flavour to avoid confusion with 
character set
        encoding.

CVSWeb URLs:
http://cvs.savannah.gnu.org/viewcvs/koha/koha/misc/migration_tools/bulkmarcimport.pl.diff?only_with_tag=rel_2_2&tr1=1.1.2.2&tr2=1.1.2.3&r1=text&r2=text

Patches:
Index: koha/misc/migration_tools/bulkmarcimport.pl
diff -u koha/misc/migration_tools/bulkmarcimport.pl:1.1.2.2 
koha/misc/migration_tools/bulkmarcimport.pl:1.1.2.3
--- koha/misc/migration_tools/bulkmarcimport.pl:1.1.2.2 Fri Mar 10 04:11:23 2006
+++ koha/misc/migration_tools/bulkmarcimport.pl Wed Apr 26 07:36:12 2006
@@ -2,28 +2,139 @@
 # small script that import an iso2709 file into koha 2.0
 
 use strict;
+use warnings;
 
 # Koha modules used
 use MARC::File::USMARC;
+# Uncomment the line below and use MARC::File::XML again when it works better.
+# -- thd
+# use MARC::File::XML;
 use MARC::Record;
 use MARC::Batch;
+use MARC::Charset;
 use C4::Context;
 use C4::Biblio;
 use Time::HiRes qw(gettimeofday);
-
 use Getopt::Long;
+binmode(STDOUT, ":utf8");
+
 my ( $input_marc_file, $number) = ('',0);
-my ($version, $delete, $test_parameter,$char_encoding, $verbose);
+my ($version, $delete, $test_parameter,$marcFlavour, $verbose);
+
 GetOptions(
-    'file:s'    => \$input_marc_file,
-    'n' => \$number,
-    'h' => \$version,
-    'd' => \$delete,
-    't' => \$test_parameter,
-    'c:s' => \$char_encoding,
-    'v:s' => \$verbose,
+       'file:s'    => \$input_marc_file,
+       'n' => \$number,
+       'h' => \$version,
+       'd' => \$delete,
+       't' => \$test_parameter,
+       'c:s' => \$marcFlavour,
+       'v:s' => \$verbose,
 );
 
+# FIXME:  Management of error conditions needed for record parsing problems
+# and MARC8 character sets with mappings to Unicode not yet included in 
+# MARC::Charset.  The real world rarity of these problems is not fully tested.
+# Unmapped character sets will throw a warning currently and processing will 
+# continue with the error condition.  A fairly trivial correction should 
+# address some record parsing and unmapped character set problems but I need 
+# time to implement a test and correction for undef subfields and revert to 
+# MARC8 if mappings are missing. -- thd
+sub fMARC8ToUTF8($$) {
+       my ($record) = shift;
+       my ($verbose) = shift;
+       if ($verbose) {
+               if ($verbose >= 2) {
+                       my $leader = $record->leader();
+                       $leader =~ s/ /#/g;
+                       print "\n000 " . $leader;
+               }
+       }
+       foreach my $field ($record->fields()) {
+               if ($field->is_control_field()) {
+                       if ($verbose) {
+                               if ($verbose >= 2) {
+                                       my $fieldName = $field->tag();
+                                       my $fieldValue = $field->data();
+                                       $fieldValue =~ s/ /#/g;
+                                       print "\n" . $fieldName;
+                                       print ' ' . $fieldValue;
+                               }
+                       }
+               } else {
+                       my @subfieldsArray;
+                       my $fieldName = $field->tag();
+                       my $indicator1Value = $field->indicator(1);
+                       my $indicator2Value = $field->indicator(2);
+                       if ($verbose) {
+                               if ($verbose >= 2) {
+                                       $indicator1Value =~ s/ /#/;
+                                       $indicator2Value =~ s/ /#/;
+                                       print "\n" . $fieldName . ' ' . 
+                                                       $indicator1Value . 
+                                       $indicator2Value;
+                               }
+                       }
+                       foreach my $subfield ($field->subfields()) {
+                               my $subfieldName = $subfield->[0];
+                               my $subfieldValue = $subfield->[1];
+                               $subfieldValue = 
MARC::Charset::marc8_to_utf8($subfieldValue);
+                               
+                               # Alas, MARC::Field::update() does not work 
correctly.
+                               ## push (@subfieldsArray, $subfieldName, 
$subfieldValue);
+                               
+                               push @subfieldsArray, [$subfieldName, 
$subfieldValue];
+                               if ($verbose) {
+                                       if ($verbose >= 2) {
+                                               print " \$" . $subfieldName . ' 
' . $subfieldValue;
+                                       }
+                               }
+                       }
+                       
+                       # Alas, MARC::Field::update() does not work correctly.
+                       # 
+                       # The first instance in the field of a of a repeated 
subfield 
+                       # overwrites the content from later instances with the 
content 
+                       # from the first instance.
+                       ## $field->update(@subfieldsArray);
+                       
+                       foreach my $subfieldRow(@subfieldsArray) {
+                               my $subfieldName = $subfieldRow->[0];
+                               $field->delete_subfields($subfieldName);
+                       }
+                       foreach my $subfieldRow(@subfieldsArray) {
+                               $field->add_subfields(@$subfieldRow);
+                       }
+                       
+                       if ($verbose) {
+                               if ($verbose >= 2) {
+                                       # Reading the indicator values again is 
not necessary.  
+                                       # They were not converted.
+                                       # $indicator1Value = 
$field->indicator(1);
+                                       # $indicator2Value = 
$field->indicator(2);
+                                       # $indicator1Value =~ s/ /#/;
+                                       # $indicator2Value =~ s/ /#/;
+                                       print "\nCONVERTED TO UTF-8:\n" . 
$fieldName . ' ' . 
+                                                       $indicator1Value . 
+                                       $indicator2Value;
+                                       foreach my $subfield 
($field->subfields()) {
+                                               my $subfieldName = 
$subfield->[0];
+                                               my $subfieldValue = 
$subfield->[1];
+                                               print " \$" . $subfieldName . ' 
' . $subfieldValue;
+                                       }
+                               }
+                       }
+                       if ($verbose) {
+                               if ($verbose >= 2) {
+                                       print "\n" if $verbose;
+                               }
+                       }
+               }
+       }
+       $record->encoding('UTF-8');
+       return $record;
+}
+
+
 if ($version || ($input_marc_file eq '')) {
        print <<EOF
 small script to import an iso2709 file into Koha.
@@ -33,8 +144,9 @@
 \tv : verbose mode. 1 means "some infos", 2 means "MARC dumping"
 \tn : the number of the record to import. If missing, all the file is imported
 \tt : test mode : parses the file, saying what he would do, but doing nothing.
-\tc : the char encoding. At the moment, only MARC21 and UNIMARC supported. 
MARC21 by default.
-\d : delete EVERYTHING related to biblio in koha-DB before import  :tables :
+\tc : the characteristic MARC flavour. At the moment, only MARC21 and UNIMARC 
+\tsupported. MARC21 by default.
+\td : delete EVERYTHING related to biblio in koha-DB before import  :tables :
 \t\tbiblio, \t\tbiblioitems, \t\tsubjects,\titems
 \t\tadditionalauthors, \tbibliosubtitles, \tmarc_biblio,
 \t\tmarc_subfield_table, \tmarc_word, \t\tmarc_blob_subfield
@@ -66,8 +178,8 @@
        print "TESTING MODE ONLY\n    DOING NOTHING\n===============\n";
 }
 
-$char_encoding = 'MARC21' unless ($char_encoding);
-print "CHAR : $char_encoding\n" if $verbose;
+$marcFlavour = 'MARC21' unless ($marcFlavour);
+print "Characteristic MARC flavour: $marcFlavour\n" if $verbose;
 my $starttime = gettimeofday;
 my $batch = MARC::Batch->new( 'USMARC', $input_marc_file );
 $batch->warnings_off();
@@ -77,23 +189,64 @@
 my ($tagfield,$tagsubfield) = 
&MARCfind_marc_from_kohafield($dbh,"items.itemnumber",'');
 # $dbh->do("lock tables biblio write, biblioitems write, items write, 
marc_biblio write, marc_subfield_table write, marc_blob_subfield write, 
marc_word write, marc_subfield_structure write, stopwords write");
 while ( my $record = $batch->next() ) {
-    #FIXME: it's kind of silly to go from MARC::Record to MARC::File::XML and 
then back again just to fix the encoding
-    my $uxml = $record->as_xml;
-    $record = MARC::Record::new_from_xml($uxml, 'UTF-8');
        $i++;
+#FIXME: it's kind of silly to go from MARC::Record to MARC::File::XML and 
+       # then back again just to fix the encoding
+       #
+       # It is even sillier when the conversion too frequently produces errors 
+       # instead of fixing the encoding.  Hence, the following MARC::File::XML 
+       # lines are now commented out until character set conversion in XML 
+       # works better. -- thd
+       ## my $uxml = $record->as_xml;
+       ## $record = MARC::Record::new_from_xml($uxml, 'UTF-8');
+       
+       # Check record encoding and convert encoding if necessary.
+       
+       if ($marcFlavour eq 'MARC21') {
+               my $tag000_pos09;
+               if ($record->encoding() eq 'UTF-8') {
+                       if ($verbose) {
+                               print "\nRecord $i encoding is UTF-8\n";
+                               $tag000_pos09 = substr ($record->leader, 9, 1);
+                               $tag000_pos09 =~ s/ /#/;
+                               print "\nUTF-8 LEADER/09: " . $tag000_pos09 
."\n";
+                       }
+               } elsif ($record->encoding() eq 'MARC-8') {
+                       print "\nConverting record $i encoding from MARC8 to 
UTF-8\n";
+                       # Convert MARC-8 to UTF-8
+                       $record = fMARC8ToUTF8($record, $verbose);
+                       if ($verbose) {
+                               print "\nRecord $i encoding has been converted 
to UTF-8\n";
+                               $tag000_pos09 = substr ($record->leader, 9, 1);
+                               $tag000_pos09 =~ s/ /#/;
+                               print "\nUTF-8 LEADER/09: " . $tag000_pos09 
."\n";
+                       }
+               }
+       } elsif ($marcFlavour eq 'UNIMARC') {
+               # I have not developed a UNIMARC character encoding conversion 
script 
+               # yet.  Common encodings should be easy.  Less comon and 
multiple 
+               # encodings will need extra work.  I am happy to work on this 
if there 
+               # is some interest. -- thd
+       }
+       
        #now, parse the record, extract the item fields, and store them in 
somewhere else.
 
-    ## create an empty record object to populate
-    my $newRecord = MARC::Record->new();
+       ## create an empty record object to populate
+       my $newRecord = MARC::Record->new();
        $newRecord->leader($record->leader());
 
-    # go through each field in the existing record
-    foreach my $oldField ( $record->fields() ) {
+       # go through each field in the existing record
+       foreach my $oldField ( $record->fields() ) {
 
        # just reproduce tags < 010 in our new record
-       if ( $oldField->tag() < 10 ) {
-           $newRecord->append_fields( $oldField );
-           next();
+       # 
+       # Fields are not necessarily only numeric in the actual world of 
records 
+       # nor in what I would recommend for additonal safe non-interfering local
+       # use fields.  The following regular expression match is much safer 
than 
+       # a numeric evaluation. -- thd
+       if ( $oldField->tag() =~ m/^00/ ) {
+               $newRecord->append_fields( $oldField );
+               next();
        }
 
        # store our new subfield data in this list
@@ -103,23 +256,25 @@
        foreach my $pair ( $oldField->subfields() ) { 
                $pair->[1] =~ s/\<//g;
                $pair->[1] =~ s/\>//g;
-               push( @newSubfields, $pair->[0], 
char_decode($pair->[1],$char_encoding) );
+               push( @newSubfields, $pair->[0], 
char_decode($pair->[1],$marcFlavour) );
        }
 
        # add the new field to our new record
        my $newField = MARC::Field->new(
-           $oldField->tag(),
-           $oldField->indicator(1),
-           $oldField->indicator(2),
-           @newSubfields
+               $oldField->tag(),
+               $oldField->indicator(1),
+               $oldField->indicator(2),
+               @newSubfields
        );
 
        $newRecord->append_fields( $newField );
 
-    }
+       }
 
 
-       warn "$i ==>".$newRecord->as_formatted() if $verbose eq 2;
+       if ($verbose) {
+#              warn "$i ==>".$newRecord->as_formatted() if $verbose eq 2;
+       }
        my @fields = $newRecord->field($tagfield);
        my @items;
        my $nbitems=0;




reply via email to

[Prev in Thread] Current Thread [Next in Thread]