[Namazu-devel-ja 1741] Re: filter/ooo.pl を Archive::Zip に対応するパッチ

Tadamasa Teranishi yw3t-trns @ asahi-net.or.jp
2008年 5月 3日 (土) 03:23:02 JST


寺西です。

Tadamasa Teranishi wrote:
> 
> とりあえず filter/ooo.pl を Archive::Zip に対応するパッチを書きま
> した。(development-2-1)

これの改良版です。(development-2-1)

前回のものに加えて、IO::String が利用できる環境ではテンポラリファイル
を作成しない処理も加えました。

koffice.pl にも使えます。
msofficexml.pl, xps.pl については、更にzip ファイル内のメンバを検索
する機能を追加する必要はありますが、Archive::Zip に対応することは
可能だと思います。
-- 
=====================================================================
寺西 忠勝(TADAMASA TERANISHI)  yw3t-trns @ asahi-net.or.jp
http://www.asahi-net.or.jp/~yw3t-trns/index.htm
Key fingerprint =  474E 4D93 8E97 11F6 662D  8A42 17F5 52F4 10E7 D14E
-------------- next part --------------
Index: ooo.pl
===================================================================
RCS file: /storage/cvsroot/namazu/filter/ooo.pl,v
retrieving revision 1.16.2.3
diff -u -p -r1.16.2.3 ooo.pl
--- ooo.pl	29 Apr 2008 11:20:58 -0000	1.16.2.3
+++ ooo.pl	2 May 2008 18:16:18 -0000
@@ -32,6 +32,7 @@ require 'gfilter.pl';
 my $utfconvpath = undef;
 my $unzippath = undef;
 my @unzipopts;
+my $_zread = undef;
 
 sub mediatype() {
     # http://framework.openoffice.org/documentation/mimetypes/mimetypes.html
@@ -46,8 +47,22 @@ sub mediatype() {
 }
 
 sub status() {
+    if (util::checklib('Compress/Zlib.pm') and
+    util::checklib('Archive/Zip.pm')) {
+        if (util::checklib('IO/String.pm')) {
+            $_zread = \&_az_is_zread;
+        } else {
+            $_zread = \&_az_zread;
+        }
+        if ($English::PERL_VERSION >= 5.008) {
+            $utfconvpath = "none";
+            return 'yes';
+        }
+    }
+
     $unzippath = util::checkcmd('unzip');
-    if (defined $unzippath){
+    if (defined $unzippath) {
+        $_zread = \&_unzip_zread;
         @unzipopts = ("-p");
         if (util::islang("ja")) {
            if ($English::PERL_VERSION >= 5.008) {
@@ -55,17 +70,17 @@ sub status() {
                return 'yes';
            }
            $utfconvpath = util::checkcmd('lv');
-           if ($utfconvpath){ 
+           if ($utfconvpath) {
                return 'yes';
            } else {
                $utfconvpath = util::checklib('unicode.pl');
-               if ($utfconvpath){ 
+               if ($utfconvpath) {
                    return 'yes';
                }
            }
-           return 'no'; 
+           return 'no';
         } else {
-           return 'yes'; 
+           return 'yes';
         }
     }
     return 'no';
@@ -109,25 +124,10 @@ sub filter ($$$$$) {
 
 sub filter_metafile ($$$) {
     my ($contref, $weighted_str, $fields) = @_;
-    my $metafile = 'meta.xml';
+
     my $xml = "";
-    my $tmpfile  = util::tmpnam('NMZ.zip');
-    { 
-        my $fh = util::efopen("> $tmpfile");
-        print $fh $$contref;
-        util::fclose($fh);
-    }
-    my @cmd = ($unzippath, @unzipopts, $tmpfile, $metafile);
-    my $status = util::syscmd(
-        command => \@cmd,
-        option => {
-            "stdout" => \$xml,
-            "stderr" => "/dev/null",
-            "mode_stdout" => "wt",
-            "mode_stderr" => "wt",
-        },
-    );
-    unlink $tmpfile;
+    my $status = $_zread->($contref, 'meta.xml', \$xml);
+    codeconv::normalize_nl(\$xml);
 
     my $authorname = ooo::get_author(\$xml);
     my $title = ooo::get_title(\$xml);
@@ -145,10 +145,10 @@ sub filter_metafile ($$$) {
         codeconv::normalize_eucjp(\$title);
         codeconv::normalize_eucjp(\$keywords);
     }
-    if (!($authorname eq "")){
+    if (!($authorname eq "")) {
         $fields->{'author'} = $authorname;
     }
-    if (!($title eq "")){
+    if (!($title eq "")) {
         $fields->{'title'} = $title;
         my $weight = $conf::Weight{'html'}->{'title'};
         $$weighted_str .= "\x7f$weight\x7f$title\x7f/$weight\x7f\n";
@@ -162,25 +162,10 @@ sub filter_metafile ($$$) {
 
 sub filter_contentfile ($$$$$) {
     my ($contref, $weighted_str, $headings, $fields) = @_;
-    my $contentfile = "content.xml";
+
     my $xml = "";
-    my $tmpfile  = util::tmpnam('NMZ.zip');
-    { 
-        my $fh = util::efopen("> $tmpfile");
-        print $fh $$contref;
-        util::fclose($fh);
-    }
-    my @cmd = ($unzippath, @unzipopts, $tmpfile, $contentfile);
-    my $status = util::syscmd(
-        command => \@cmd,
-        option => {
-            "stdout" => \$xml,
-            "stderr" => "/dev/null",
-            "mode_stdout" => "wt",
-            "mode_stderr" => "wt",
-        },
-    );
-    unlink $tmpfile;
+    my $status = $_zread->($contref, 'content.xml', \$xml);
+    codeconv::normalize_nl(\$xml);
 
     ooo::remove_all_tag(\$xml);
     ooo::decode_entity(\$xml);
@@ -209,7 +194,7 @@ sub get_author ($){
 
 sub get_title ($){
   my ($contref) = @_;
-  if ($$contref =~ m!<dc:title>(.*)</dc:title>!){
+  if ($$contref =~ m!<dc:title>(.*)</dc:title>!) {
       return $1;
   } else {
       return "";
@@ -247,7 +232,7 @@ sub utoe ($) {
         my $cmd = ($utfconvpath . " -Iu8 " . "-Oej " . $tmpfile . " |");
         $$tmp = "";
         my $fh = util::efopen($cmd);
-        while (defined(my $line = <$fh>)){
+        while (defined(my $line = <$fh>)) {
             $$tmp .= $line;
         }
         util::fclose($fh);
@@ -286,4 +271,120 @@ sub decode_entity ($) {
     $$text =~ s/&nbsp[;\s]/ /g;
 }
 
+sub _unzip_zread($$$)
+{
+    my ($contref, $filename, $conts) = @_;
+
+    util::vprint("Processing ooo file ... (using  '$unzippath')\n");
+
+    my $tmpfile  = util::tmpnam('NMZ.zip');
+    {
+        my $fh = util::efopen("> $tmpfile");
+        print $fh $$contref;
+        util::fclose($fh);
+    }
+
+    my @cmd = ($unzippath, @unzipopts, $tmpfile, $filename);
+    my $status = util::syscmd(
+        command => \@cmd,
+        option => {
+            "stdout" => $conts,
+            "stderr" => "/dev/null",
+            "mode_stdout" => "wb",
+            "mode_stderr" => "wt",
+        },
+    );
+
+    unlink $tmpfile;
+
+    return $status;
+}
+
+use constant NMZ_AZ_OK           => 0;
+use constant NMZ_AZ_STREAM_END   => 1;
+use constant NMZ_AZ_ERROR        => 2;
+use constant NMZ_AZ_FORMAT_ERROR => 3;
+use constant NMZ_AZ_IO_ERROR     => 4;
+
+sub _az_zread($$$)
+{
+    eval 'use Archive::Zip;';
+    my ($contref, $filename, $conts) = @_;
+
+    util::vprint("Processing ooo file ... (using  'Archive::Zip')\n");
+
+    my $tmpfile  = util::tmpnam('NMZ.zip');
+    {
+        my $fh = util::efopen("> $tmpfile");
+        print $fh $$contref;
+        util::fclose($fh);
+    }
+
+    $$conts = '';
+
+    my $zip = Archive::Zip->new();
+    my $err = $zip->read($tmpfile);
+    if ($err != NMZ_AZ_OK) {
+        util::dprint("Archive::Zip: there was a error");
+        unlink $tmpfile;
+        return 1;
+    }
+
+    my $member = $zip->memberNamed($filename);
+    my $size = $member->uncompressedSize();
+    if ($size == 0) {
+        my $fname = $member->fileName();
+        util::dprint("$fname: filesize is 0");
+        unlink $tmpfile;
+        return 1;
+    } elsif ($size > $conf::FILE_SIZE_MAX) {
+        my $fname = $member->fileName();
+        util::dprint("$fname: Too large ooo file");
+        unlink $tmpfile;
+        return 1;
+    }
+
+    $$conts = $zip->contents($member);
+
+    unlink $tmpfile;
+
+    return 0;
+}
+
+sub _az_is_zread($$$)
+{
+    eval 'use Archive::Zip;';
+    eval 'use IO::String;';
+    my ($contref, $filename, $conts) = @_;
+
+    util::vprint("Processing ooo file ... (using  'Archive::Zip, IO::String')\n");
+
+    my $io = IO::String->new($$contref);
+
+    $$conts = '';
+
+    my $zip = Archive::Zip->new();
+    my $err = $zip->readFromFileHandle($io);
+    if ($err != NMZ_AZ_OK) {
+        util::dprint("Archive::Zip: there was a error");
+        return 1;
+    }
+
+    my $member = $zip->memberNamed($filename);
+    my $size = $member->uncompressedSize();
+    if ($size == 0) {
+        my $fname = $member->fileName();
+        util::dprint("$fname: filesize is 0");
+        return 1;
+    } elsif ($size > $conf::FILE_SIZE_MAX) {
+        my $fname = $member->fileName();
+        util::dprint("$fname: Too large ooo file");
+        return 1;
+    }
+
+    $$conts = $zip->contents($member);
+
+    return 0;
+}
+
 1;


Namazu-devel-ja メーリングリストの案内