%PDF- %PDF-
Direktori : /home/rs/perl/default/lib/perl5/site_perl/5.32/Mail/SpamAssassin/Plugin/Tokenizer/ |
Current File : /home/rs/perl/default/lib/perl5/site_perl/5.32/Mail/SpamAssassin/Plugin/Tokenizer/MeCab.pm |
# <@LICENSE>
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to you under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at:
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# </@LICENSE>

=head1 NAME

Tokenizer::MeCab - Japanese tokenizer with MeCab

=head1 SYNOPSIS

  loadplugin     Mail::SpamAssassin::Plugin::Tokenizer::MeCab

=head1 DESCRIPTION

This plugin tokenizes Japanese strings with MeCab, a morphological
analysis engine.

The MeCab Perl module, version 0.996 or higher, is required.

=cut

package Mail::SpamAssassin::Plugin::Tokenizer::MeCab;

use strict;
use warnings;

use Mail::SpamAssassin::Plugin::Tokenizer;

our @ISA = qw(Mail::SpamAssassin::Plugin::Tokenizer);

# Have to do this so that RPM doesn't find these as required perl modules
BEGIN { require MeCab; }

# Language tag handed to the base-class constructor.
our $language = 'ja';

# One tagger shared by every call; created when this module is loaded.
our $mecab = MeCab::Tagger->new('-Ochasen');

# Constructor.
#
#   $class        - class name, or an existing object whose class is reused
#   $mailsaobject - passed through unchanged to the parent constructor
#
# Returns the object built by the parent class, blessed into $class.
sub new {
  my $class = shift;
  my $mailsaobject = shift;

  $class = ref($class) || $class;
  my $self = $class->SUPER::new($mailsaobject, $language);
  bless($self, $class);

  return $self;
}

# Tokenize every string of an array.
#
#   $text_array - reference to an array of strings
#
# Returns a reference to a new array holding the processed strings.
# Elements that are false in boolean context (undef, '', '0') are skipped,
# so the result may be shorter than the input.
#
# NOTE(review): the loop variable aliases the elements of @$text_array, so
# the caller's strings are modified in place as well (encoded to UTF-8 bytes
# and rewritten).  This is kept as-is because callers may depend on it;
# confirm before changing it to work on copies.
sub tokenize {
  my ($self, $text_array) = @_;

  my @tokenized_array;
  foreach my $text (@$text_array) {
    next unless ($text);

    # The byte-oriented patterns below need UTF-8 octets, not characters.
    utf8::encode($text) if utf8::is_utf8($text);

    # Hand each run of 3- and 4-byte UTF-8 sequences to MeCab and splice the
    # space-separated tokens back in; everything else is left untouched.
    $text =~ s/((?:[\xE0-\xEF][\x80-\xBF]{2}|[\xF0-\xF4][\x80-\xBF]{3})+)/_tokenize($1)/eg;

    # Collapse the padding added by _tokenize and trim one space per end.
    $text =~ s/ +/ /g;
    $text =~ s/^ //;
    $text =~ s/ $//;

    push(@tokenized_array, $text);
  }

  return \@tokenized_array;
}

# stop words
# Source: mecab-ipadic-2.7.0-20070801: pos-id.def
# (original entry, its part-of-speech id, and an English gloss)
#   その他,間投,*,* 0                     other, interjection
#   フィラー,*,*,* 1                      filler
#   記号,括弧開,*,* 5                     symbol, opening bracket
#   記号,括弧閉,*,* 6                     symbol, closing bracket
#   記号,句点,*,* 7                       symbol, full stop
#   記号,空白,*,* 8                       symbol, whitespace
#   記号,読点,*,* 9                       symbol, comma
#   助詞,格助詞,一般,* 13                 particle, case, general
#   助詞,格助詞,引用,* 14                 particle, case, quotative
#   助詞,格助詞,連語,* 15                 particle, case, compound
#   助詞,係助詞,*,* 16                    particle, binding
#   助詞,終助詞,*,* 17                    particle, sentence-final
#   助詞,接続助詞,*,* 18                  particle, conjunctive
#   助詞,特殊,*,* 19                      particle, special
#   助詞,副詞化,*,* 20                    particle, adverbializing
#   助詞,副助詞,*,* 21                    particle, adverbial
#   助詞,副助詞/並立助詞/終助詞,*,* 22    particle, adverbial/coordinating/final
#   助詞,並立助詞,*,* 23                  particle, coordinating
#   助詞,連体化,*,* 24                    particle, adnominalizing
#   助動詞,*,*,* 25                       auxiliary verb
#   接続詞,*,*,* 26                       conjunction
#   名詞,代名詞,一般,* 59                 noun, pronoun, general
#   名詞,非自立,一般,* 63                 noun, dependent, general
#   名詞,非自立,形容動詞語幹,* 64         noun, dependent, adjectival-noun stem
#   名詞,非自立,助動詞語幹,* 65           noun, dependent, auxiliary-verb stem
#   名詞,非自立,副詞可能,* 66             noun, dependent, usable as adverb
our @stop_words_id_list = (
  0, 1, 5, 6, 7, 8, 9, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,
  26, 59, 63, 64, 65, 66,
);

# Split one run of multi-byte text into space-separated tokens.
#
#   $text - byte string to analyse with the shared tagger
#
# Returns the kept surface forms joined by single spaces, with padding
# spaces at both ends so the result never fuses with neighbouring text
# (tokenize() collapses the surplus spaces afterwards).
sub _tokenize {
  my $text = shift;

  my @tokens = (' ');

  # The loop stops at the node that has no {next}, so that final node is
  # never examined.  A C-style loop is used so that `next` still advances
  # to the following node.
  for (my $node = $mecab->parseToNode($text); $node->{next}; $node = $node->{next}) {
    # Fetch the id once per node instead of once per list comparison.
    my $posid = $node->{posid};

    # Skip if stop words.
    next if (grep { $_ eq $posid } @stop_words_id_list);

    my $surface = $node->{surface};

    # Skip if one Hiragana or Katakana (a single U+3040..U+30FF character,
    # i.e. bytes E3 81..83 80..BF in UTF-8).
    next if ($surface =~ /^\xE3[\x81-\x83][\x80-\xBF]$/);

    push(@tokens, $surface);
  }

  push(@tokens, ' ');
  return join(' ', @tokens);
}

1;