# Mail/SpamAssassin/Plugin/Tokenizer/MeCab.pm
# <@LICENSE>
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to you under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at:
# 
#     http://www.apache.org/licenses/LICENSE-2.0
# 
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# </@LICENSE>

=head1 NAME

Tokenizer::MeCab - Japanese tokenizer with MeCab

=head1 SYNOPSIS

  loadplugin     Mail::SpamAssassin::Plugin::Tokenizer::MeCab

=head1 DESCRIPTION

This plugin tokenizes Japanese strings with MeCab, a morphological
analysis engine.

MeCab perl module 0.996 or higher is required.

=cut

package Mail::SpamAssassin::Plugin::Tokenizer::MeCab;

use strict;
use warnings;
use Mail::SpamAssassin::Plugin::Tokenizer;

use vars qw(@ISA);
@ISA = qw(Mail::SpamAssassin::Plugin::Tokenizer);

# Have to do this so that RPM doesn't find these as required perl modules
BEGIN { require MeCab; }

# Language code handed to the base-class constructor by new().
our $language = 'ja';

# Tagger instance shared by every call to _tokenize() in this module.
# The option string is quoted and the constructor is called with the arrow
# form, so the parse does not depend on indirect-object resolution.
our $mecab = MeCab::Tagger->new('-Ochasen');

# Constructor.  May be invoked on the class name or on an existing instance;
# in the latter case the instance's class is used.  The object is built by
# the parent constructor, which receives the SpamAssassin object and this
# module's language code, and is then blessed into the resolved class.
sub new {
  my ($class, $mailsaobject) = @_;

  $class = ref($class) if ref($class);

  my $self = $class->SUPER::new($mailsaobject, $language);
  return bless($self, $class);
}

# Tokenize a list of text chunks.
#
# Arguments:
#   $self       - the plugin object (not consulted here)
#   $text_array - reference to an array of strings
#
# Returns a reference to a NEW array holding one processed string per input
# chunk that is true in boolean context; false chunks (undef, '', '0') are
# skipped and contribute no entry.  For each kept chunk, every run of 3- or
# 4-byte UTF-8 sequences is replaced by the output of _tokenize(), repeated
# spaces are collapsed to one, and a single leading/trailing space is removed.
#
# The caller's array is left untouched.
sub tokenize {
  my ($self, $text_array) = @_;

  my @tokenized_array;
  foreach my $original (@$text_array) {
    next unless ($original);

    # The loop variable aliases the caller's element, so every in-place
    # operation below must run on a private copy instead.
    my $text = $original;

    # Work on UTF-8 octets: the patterns below match raw bytes.
    utf8::encode($text) if utf8::is_utf8($text);
    $text =~ s/((?:[\xE0-\xEF][\x80-\xBF]{2}|[\xF0-\xF4][\x80-\xBF]{3})+)/_tokenize($1)/eg;

    # _tokenize() pads its result with blanks; normalise the spacing.
    $text =~ s/ +/ /g;
    $text =~ s/^ //;
    $text =~ s/ $//;
    push(@tokenized_array, $text);
  }
  return \@tokenized_array;
}

# stop words
# Source: mecab-ipadic-2.7.0-20070801: pos-id.def
# その他,間投,*,* 0
# フィラー,*,*,* 1
# 記号,括弧開,*,* 5
# 記号,括弧閉,*,* 6
# 記号,句点,*,* 7
# 記号,空白,*,* 8
# 記号,読点,*,* 9
# 助詞,格助詞,一般,* 13
# 助詞,格助詞,引用,* 14
# 助詞,格助詞,連語,* 15
# 助詞,係助詞,*,* 16
# 助詞,終助詞,*,* 17
# 助詞,接続助詞,*,* 18
# 助詞,特殊,*,* 19
# 助詞,副詞化,*,* 20
# 助詞,副助詞,*,* 21
# 助詞,副助詞／並立助詞／終助詞,*,* 22
# 助詞,並立助詞,*,* 23
# 助詞,連体化,*,* 24
# 助動詞,*,*,* 25
# 接続詞,*,*,* 26
# 名詞,代名詞,一般,* 59
# 名詞,非自立,一般,* 63
# 名詞,非自立,形容動詞語幹,* 64
# 名詞,非自立,助動詞語幹,* 65
# 名詞,非自立,副詞可能,* 66
# Part-of-speech ids whose nodes _tokenize() discards, written as ranges
# wherever the ids are consecutive.
our @stop_words_id_list = (
    0, 1,
    5 .. 9,
    13 .. 26,
    59,
    63 .. 66,
);

# Split one run of multi-byte UTF-8 text into space-separated tokens.
#
# Takes the text run and hands it to the shared $mecab tagger.  Returns the
# surfaces of the kept nodes joined by single spaces and padded with blanks
# at both ends (tokenize() collapses the extra spaces afterwards).  A node is
# dropped when its part-of-speech id is listed in @stop_words_id_list, or
# when its surface is a single three-byte character in the E3 81..83 lead
# range, i.e. one Hiragana or Katakana.
#
# If the tagger returns no node list, the text is returned unchanged.
sub _tokenize {
  my $text = shift;

  my $node = $mecab->parseToNode($text);

  # Without this guard the ->{next} dereference below would die; handing the
  # run back as-is keeps the content rather than silently discarding it.
  return $text unless $node;

  # Build the lookup once per call instead of scanning the list per node.
  # It is rebuilt on each call so later changes to the list are honoured.
  my %is_stop_word = map { $_ => 1 } @stop_words_id_list;

  my @surfaces;

  # Walk the linked list.  The final node (the one without {next}) is never
  # examined.  This must stay a C-style loop: "next" has to run the advance
  # step, otherwise a skipped node would be revisited forever.
  for (; $node->{next}; $node = $node->{next}) {
    # Skip if stop words.
    next if $is_stop_word{ $node->{posid} };

    # Skip if one Hiragana or Katakana.
    next if ($node->{surface} =~ /^\xE3[\x81-\x83][\x80-\xBF]$/);

    push(@surfaces, $node->{surface});
  }

  return join(' ', ' ', @surfaces, ' ');
}

1;
# End of Mail::SpamAssassin::Plugin::Tokenizer::MeCab