Namazu-devel-ja(旧)


[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: multipart内のファイルのインデックス化



臼田です

Yukio USUDA wrote:
> macbinary.plで使った方法を用いて
> filter/mailnews.plに手を加えてmailファイルのmultipart部分の
> インデックス化を試みています。

手を加えてInternetExplorerがWebアーカイブとして保存する
mhtmlファイルにも対応させました。
HEADとのdiffを添付しています。

multipart部分のデコードには
Perl5.8に標準で入っているMIME::Base64とMIME::QuoatedPrint
のモジュールを使っています。
MIME::Base64がなければbase64部分は無視しています。

multipartの部分はなんでもかんでもデコードして
他のfilterに渡すようになるのですが、
・画像等はデコードせずに無視するほうがよい?
・base64のデコードを抑制するオプションをつけたほうがよい?
・標準ではbase64は無視し、デコードしたい場合はオプションで
 指示するほうがよい?
というところで気になっています。
ご意見をください。

臼田幸生
Index: namazu/filter/mailnews.pl
===================================================================
RCS file: /storage/cvsroot/namazu/filter/mailnews.pl,v
retrieving revision 1.28
diff -u -r1.28 mailnews.pl
--- namazu/filter/mailnews.pl	7 Oct 2003 05:48:56 -0000	1.28
+++ namazu/filter/mailnews.pl	29 Jan 2004 14:02:12 -0000
@@ -28,11 +28,15 @@
 require 'util.pl';
 require 'gfilter.pl';
 
+my $has_base64 = undef;
+
 sub mediatype() {
     return ('message/rfc822', 'message/news');
 }
 
 sub status() {
+    $has_base64 = 1 if (util::checklib('MIME/Base64.pm') &&
+			util::checklib('MIME/QuotedPrint.pm'));
     return 'yes';
 }
 
@@ -111,15 +115,15 @@
 
 	    my $weight = $conf::Weight{'html'}->{'title'};
 	    $$weighted_str .= "\x7f$weight\x7f$line\x7f/$weight\x7f\n";
- 	} elsif ($line =~ s/^content-type:\s*//i) {
-	    if ($line =~ /multipart.*boundary="(.*)"/i){
+	} elsif ($line =~ s/^content-type:\s*//i) {
+	    if ($line =~ /multipart.*boundary="(.*?)"/i){
 		$boundary = $1;
 		util::dprint("((boundary: $boundary))\n");
-  	    } elsif ($line =~ m!message/partial;\s*(.*)!i) {
+	    } elsif ($line =~ m!message/partial;\s*(.*)!i) {
 		# The Message/Partial subtype routine [1998-10-12]
 		# contributed by Hiroshi Kato <tumibito@xxxxxxxxxxxxxxxxxxx>
-  		$partial = $1;
-  		util::dprint("((partial: $partial))\n");
+		$partial = $1;
+		util::dprint("((partial: $partial))\n");
             } elsif ($line !~ m!text/plain!i) {
                 $$contref = '';
                 return;
@@ -153,21 +157,60 @@
 	$boundary =~ s/(\W)/\\$1/g;
 	$$contref =~ s/This is multipart message.\n//i;
 
+	multipart_process($contref, $boundary, $weighted_str, $fields);
+
+    }
+}
+
+sub multipart_process ($$$$){
+    my ($contref, $boundary, $weighted_str, $fields) = @_;
+
+    # MIME multipart processing,
+    # modified by Furukawa-san's patch on [1998/08/27]
+    $$contref =~ s/--$boundary(--)?\n?/\xff/g;
+    my (@parts) = split(/\xff/, $$contref);
+    $$contref = '';
+    for $_ (@parts){
+	if (s/^(.*?\n\n)//s){
+	    my ($head) = $1; 
+	    my ($body) .= $_;
+	    my $contenttype = "";
+	    my $cont_encode = "";
+	    if ($head =~ m!^content-type:\s*(\S+?);?\s!mi){
+		$contenttype = lc($1);
+		util::dprint("((Content-Type: $contenttype))\n");
+	    }
+
+	    if ($head =~ m!^content-transfer-encoding:\s*(\S+)$!mi){
+		$cont_encode = lc($1);
+		util::dprint("((Content-Transfer-Encode: $cont_encode))\n");
+	    }
 
-	# MIME multipart processing,
-	# modified by Furukawa-san's patch on [1998/08/27]
- 	$$contref =~ s/--$boundary(--)?\n?/\xff/g;
- 	my (@parts) = split(/\xff/, $$contref);
- 	$$contref = '';
- 	for $_ (@parts){
- 	    if (s/^(.*?\n\n)//s){
- 		my ($head) = $1;
- 		$$contref .= $_ if $head =~ m!^content-type:.*text/plain!mi;
- 	    }
- 	}
+	    if ($cont_encode =~ m/base64/){
+		base64_filter(\$body);
+	    } elsif ($cont_encode =~ m/quoted-printable/){
+		quotedprint_filter(\$body);
+	    } 
+
+	    if ($contenttype =~ m!text/plain!){
+		$$contref .= $body;
+	    } elsif ($contenttype =~ m!multipart/alternative!){
+		if ($head =~ /boundary="(.*?)"/i){
+		    my $boundary2 = $1;
+		    util::dprint("((boundary: $boundary2))\n");
+		    $boundary2 =~ s/(\W)/\\$1/g;
+		    multipart_process(\$body, $boundary2, $weighted_str, $fields);
+		    $$contref .= $body;
+		}
+	    } else {
+		nesting_filter(\$head, \$body, $contenttype, $weighted_str);
+		$$contref .= $body;
+	    }
+	}
     }
 }
 
+
 # Make mail/news citation marks not to be indexed.
 # And a greeting message at the beginning.
 # And a meaningless message such as "foo wrote:".
@@ -297,5 +340,56 @@
     }
 }
 
+sub base64_filter ($){
+    my ($bodyref) = @_;
+    if ($has_base64) {
+	eval 'use MIME::Base64 ();';
+	$$bodyref = MIME::Base64::decode($$bodyref);
+    } else {
+	$$bodyref="";
+    }
+}
+
+sub quotedprint_filter ($){
+    my ($bodyref) = @_;
+    if ($has_base64) {
+	eval 'use MIME::QuotedPrint ();';
+	$$bodyref = MIME::QuotedPrint::decode_qp($$bodyref);
+    } else {
+	$$bodyref="";
+    }
+}
+
+sub nesting_filter ($$$$){
+    my ($headref, $bodyref, $mmtype, $weighted_str) = @_;
+    my $err = undef;
+    my $dummy_shelterfname="";
+    my $headings = "";
+    my %fields;
+    my $filename = "";
+    if ($$headref =~ m!^content-disposition:\s*\S+\s*filename="(.+)"!mi){
+	$filename = $1;
+    } elsif ($$headref =~ m!^content-location:\s*(\S+)!mi){
+	$filename = $1;
+    }
+    util::dprint("((Attached filename: $filename))\n");
+
+    if ($mmtype =~ m!application/octet-stream!){
+	$mmtype = undef
+    }
+    my ($kanji, $mtype) = mknmz::apply_filter(\$filename, $bodyref, 
+			$weighted_str, \$headings, \%fields, 
+			$dummy_shelterfname, $mmtype);
+    if (($mtype =~ /; x-system=unsupported$/) ||
+	($mtype =~ /; x-error=.*$/)){
+	$$bodyref = "";
+	$err = "filter/mailnews.pl gets error from other filter";
+	util::dprint($err);
+    }else{
+	gfilter::show_filter_debug_info($bodyref, $weighted_str,
+					\%fields, \$headings);
+    }
+    return $err;
+}
 
 1;