namazu-ml(ring)


[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Word97の検索



はじめまして、倉部といいます。

 Namazuを使って、Word97の検索をしようと、簡単なヘルパープログラムを作成
しました。
 Win32, Active Perl, Word97が入っていないと動きません。
 他に、どんな方法を取られているのでしょうか?

 過去の履歴を見ると、  MSWordView, MSWord 8 converter for unix がある
ようですが、まだ試していません。

P.S. ReadMSWord.plの中で、Word文書が持っているProperty(Title, Subject,
Author, etc)の値に、
   Subject:, Date: 等のフィールド名を付けて、インデックスの作成をさ
せています。

------------- 以下 私のサンプル --------------
mknmzを変更したところ
my $TARGET_FILE = '.*\.html?|.*\.txt|.*_default|.*\.doc|.*\.rtf'; # docとrtfを追加
my %HELPER_PROGRAMS = (
    'gz'  => 'zcat',
    'Z'   => 'zcat',
    'man' => 'groff -man -Tnippon',
    'doc' => 'perl.exe d:/usr/local/namazu/bin/ReadMSWord.pl', # docを追加
    'rtf' => 'perl.exe d:/usr/local/namazu/bin/ReadMSWord.pl', # rtfを追加
);

 ReadMSWord.pl の内容

# Created by Jun Kurabe
# 1999/10/30
use Win32::OLE;
use Win32::OLE::Enum;

package ReadMSWord;

# Extract the indexable text of a Word document through OLE automation.
#
# Attaches to a running Word instance when there is one, otherwise starts a
# new one (which is told to Quit when its OLE object is destroyed), opens the
# file, and concatenates the output of getProperties, getParagraphs,
# getShapes and getHeadersFooters, in that order.
#
# Parameters:
#   $fileName - path of the document to read.
#               NOTE(review): Word presumably resolves relative paths against
#               its own working directory, so callers should pass an absolute
#               path -- confirm.
# Returns:
#   The collected text as a single string.
# Dies:
#   When no file name is given, Word cannot be reached or started, the
#   document cannot be opened, or one of the extractors fails.
sub ReadMSWord {
    my $fileName = shift;
    die "No file name given" unless defined $fileName && length $fileName;

    my $word;
    # use existing instance if Word is already running
    eval { $word = Win32::OLE->GetActiveObject('Word.Application') };
    die "MSWord not installed" if $@;
    unless (defined $word) {
        $word = Win32::OLE->new('Word.Application', sub { $_[0]->Quit; })
            or die "Oops, cannot start Word";
    }
    # for debug
    # $word->{Visible} = 1;

    my $doc = $word->{Documents}->open($fileName);
    # A failed open yields undef; report it here instead of crashing later
    # with an unrelated "method on undefined value" error.
    die "Cannot open '$fileName': " . Win32::OLE->LastError()
        unless defined $doc;

    # Lexical accumulator (previously an accidental package global).
    my $allText = '';
    eval {
        $allText .= getProperties($doc);
        $allText .= getParagraphs($doc);
        $allText .= getShapes($doc);
        $allText .= getHeadersFooters($doc);
    };
    my $error = $@;

    # Always close the document, even when an extractor died, so it is not
    # left open in a Word instance that keeps running.
    # NOTE(review): the 0 argument presumably means "do not save changes" -- confirm.
    $doc->close(0);
    undef $doc;
    undef $word;

    die $error if $error;
    return $allText;
}

# Collect the text of every paragraph of the document.
#
# Parameters:
#   $doc - document object exposing a Paragraphs collection.
# Returns:
#   One string holding each paragraph's text followed by "\n"
#   ('' when the collection is empty).
sub getParagraphs {
    my $doc = shift;
    my $e = Win32::OLE::Enum->new($doc->Paragraphs);
    my $allText = '';
    while (my $paragraph = $e->Next) {
        my $text = $paragraph->Range->{Text};
        $text = '' unless defined $text;
        # Remove only a trailing paragraph mark (CR) / end-of-cell mark (BEL).
        # A blind chop would eat a real character when no mark is present
        # and would leave the CR behind for a CR+BEL pair.
        $text =~ s/[\r\x07]+\z//;
        $allText .= $text . "\n";
    }

    return $allText;
}

# Render selected built-in document properties as mail-style header lines
# so that they are indexed under field names.
#
# Parameters:
#   $doc - document object exposing BuiltInDocumentProperties(index).
# Returns:
#   "Subject: <title> <subject>\nFrom: <author>,<last author>\nDate: <created>\n\n"
#   Properties that are missing or undefined are rendered as ''.
sub getProperties {
    my $doc = shift;

    # Read one built-in property by numeric index; never returns undef.
    my $value_of = sub {
        my $index = shift;
        my $property = $doc->BuiltInDocumentProperties($index);
        my $value = defined $property ? $property->{Value} : undef;
        return defined $value ? $value : '';
    };

    my $title      = $value_of->(1);
    my $subject    = $value_of->(2);
    my $author     = $value_of->(3);
    my $lastAuthor = $value_of->(7);
    my $createDate = $value_of->(11);

    my $allText = '';
    # The subject is now really included (a misspelled variable name used to
    # make it silently vanish from the Subject line).
    $allText .= 'Subject: ' . $title . ' ' . $subject;
    $allText .= "\n";
    $allText .= 'From: ' . $author . ',' . $lastAuthor;
    $allText .= "\n";
    $allText .= 'Date: ' . $createDate;
    $allText .= "\n";
    # Blank line terminates the header block.
    $allText .= "\n";

    return $allText;
}

# Collect the text of every shape of the document whose Type is 17.
#
# Parameters:
#   $doc - document object exposing a Shapes collection.
# Returns:
#   One string holding each matching shape's text followed by "\n"
#   ('' when nothing matches).
sub getShapes {
    my $doc = shift;
    my $e = Win32::OLE::Enum->new($doc->Shapes);
    my $allText = '';
    while (my $shape = $e->Next) {
        my $type = $shape->{Type};
        # Only text boxes carry text of interest (shape type 17).
        next unless defined $type && $type == 17;

        my $text = $shape->TextFrame->TextRange->{Text};
        $text = '' unless defined $text;
        # Remove only a trailing CR / BEL terminator instead of blindly
        # chopping whatever the last character happens to be.
        $text =~ s/[\r\x07]+\z//;
        $allText .= $text . "\n";
    }

    return $allText;
}

# Collect the text of all headers and footers of every section.
#
# Parameters:
#   $doc - document object exposing a Sections collection; each section
#          exposes Headers and Footers collections.
# Returns:
#   One string with, per section, every header text and then every footer
#   text, each followed by "\n".
sub getHeadersFooters {
    my $doc = shift;

    my $allText = '';

    my $e = Win32::OLE::Enum->new($doc->Sections);
    while (my $section = $e->Next) {
        # Headers first, then footers -- both are walked the same way.
        for my $collection (qw(Headers Footers)) {
            my $e_part = Win32::OLE::Enum->new($section->$collection());
            while (my $part = $e_part->Next) {
                my $text = $part->Range->{Text};
                $text = '' unless defined $text;
                # Remove only a trailing CR / BEL terminator instead of
                # blindly chopping the last character.
                $text =~ s/[\r\x07]+\z//;
                $allText .= $text . "\n";
            }
        }
    }

    return $allText;
}

#main
# Print the extracted text of the file named by the first command-line
# argument. Fail early with a usage message when the argument is missing,
# instead of handing an empty file name to Word.
die "usage: $0 <word-file>\n" unless @ARGV && defined $ARGV[0] && length $ARGV[0];
print ReadMSWord::ReadMSWord($ARGV[0]);