%PDF- %PDF-
Direktori : /home/rs/perl/fml/lib/perl5/site_perl/5.8/Mail/SpamAssassin/Plugin/Tokenizer/ |
Current File : /home/rs/perl/fml/lib/perl5/site_perl/5.8/Mail/SpamAssassin/Plugin/Tokenizer/SimpleJA.pm |
# <@LICENSE> # Licensed to the Apache Software Foundation (ASF) under one or more # contributor license agreements. See the NOTICE file distributed with # this work for additional information regarding copyright ownership. # The ASF licenses this file to you under the Apache License, Version 2.0 # (the "License"); you may not use this file except in compliance with # the License. You may obtain a copy of the License at: # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # </@LICENSE> =head1 NAME Tokenizer::SimpleJA - simple Japanese tokenizer =head1 SYNOPSIS loadplugin Mail::SpamAssassin::Plugin::Tokenizer::SimpleJA =head1 DESCRIPTION This plugin simply tokenizes Japanese strings by scripts. =cut package Mail::SpamAssassin::Plugin::Tokenizer::SimpleJA; use strict; use warnings; use Mail::SpamAssassin::Plugin::Tokenizer; use vars qw(@ISA); @ISA = qw(Mail::SpamAssassin::Plugin::Tokenizer); our $language = 'ja'; sub new { my $class = shift; my $mailsaobject = shift; $class = ref($class) || $class; my $self = $class->SUPER::new($mailsaobject, $language); bless ($self, $class); return $self; } sub tokenize { my ($self, $text_array) = @_; my @tokenized_array; foreach my $text (@$text_array) { next unless ($text); utf8::encode($text) if utf8::is_utf8($text); $text =~ s/((?:[\xE0-\xEF][\x80-\xBF]{2}|[\xF0-\xF4][\x80-\xBF]{3})+)/&_tokenize($1)/eg; $text =~ s/ +/ /g; $text =~ s/^ //; $text =~ s/ $//; push(@tokenized_array, $text); } return \@tokenized_array; } sub _tokenize { my $text = shift; utf8::decode($text) unless utf8::is_utf8($text); # Note: Perl's Katakana block does not have KATAKANA-HIRAGANA PROLONGED SOUND MARK (U+30FC). # Katakana: U+30A0 - U+30FF # Hiragana: U+3041 - U+309F $text =~ s/([\N{U+30A0}-\N{U+30FF}]+ | # Katakana [\N{U+3041}-\N{U+309F}\N{U+30FC}]+ | # Hiragana \p{Han}+ | # Han \p{gc=P}+ # General_Category: Punctuation )/ $1 /gx; # Skip Punctuation and Other. # See also: http://www.unicode.org/reports/tr44/#General_Category_Values $text =~ s/[\p{gc=P}\p{gc=C}]/ /g; # Skip Hiragana or Katakana less than two letters. $text =~ s/\b[\N{U+3041}-\N{U+30FF}]{1,2}\b//g; # Skip Variation Selector and Specials. # FE00 - FE0F: Variation Selectors # FFF0 - FFFF: Specials # E0100 - E01EF: Variation Selectors Supplement $text =~ s/[\N{U+FE00}-\N{U+FE0F}\N{U+FFF0}-\N{U+FFFF}\N{U+E0100}-\N{U+E01FE}]//g; utf8::encode($text); return $text; } 1;