%PDF- %PDF- Mini Shell
Mini Shell

Direktori : /home/rs/perl/fml/lib/perl5/site_perl/5.8/Mail/SpamAssassin/Plugin/Tokenizer/
Current File : /home/rs/perl/fml/lib/perl5/site_perl/5.8/Mail/SpamAssassin/Plugin/Tokenizer/SimpleJA.pm
# <@LICENSE>
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to you under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at:
# 
#     http://www.apache.org/licenses/LICENSE-2.0
# 
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# </@LICENSE>

=head1 NAME

Tokenizer::SimpleJA - simple Japanese tokenizer

=head1 SYNOPSIS

loadplugin     Mail::SpamAssassin::Plugin::Tokenizer::SimpleJA

=head1 DESCRIPTION

This plugin simply tokenizes Japanese strings by scripts.

=cut

package Mail::SpamAssassin::Plugin::Tokenizer::SimpleJA;

use strict;
use warnings;
use Mail::SpamAssassin::Plugin::Tokenizer;

use vars qw(@ISA);
@ISA = qw(Mail::SpamAssassin::Plugin::Tokenizer);

our $language = 'ja';

sub new {
  my $class = shift;
  my $mailsaobject = shift;

  $class = ref($class) || $class;
  my $self = $class->SUPER::new($mailsaobject, $language);
  bless ($self, $class);

  return $self;
}

sub tokenize {
  my ($self, $text_array) = @_;

  my @tokenized_array;
  foreach my $text (@$text_array) {
    next unless ($text);
    utf8::encode($text) if utf8::is_utf8($text);
    $text =~ s/((?:[\xE0-\xEF][\x80-\xBF]{2}|[\xF0-\xF4][\x80-\xBF]{3})+)/&_tokenize($1)/eg;
    $text =~ s/ +/ /g;
    $text =~ s/^ //;
    $text =~ s/ $//;
    push(@tokenized_array, $text);
  }
  return \@tokenized_array;
}

sub _tokenize {
  my $text = shift;

  utf8::decode($text) unless utf8::is_utf8($text);

  # Note: Perl's Katakana block does not have KATAKANA-HIRAGANA PROLONGED SOUND MARK (U+30FC).
  # Katakana: U+30A0 - U+30FF
  # Hiragana: U+3041 - U+309F
  $text =~ s/([\N{U+30A0}-\N{U+30FF}]+           |  # Katakana
              [\N{U+3041}-\N{U+309F}\N{U+30FC}]+ |  # Hiragana
              \p{Han}+                           |  # Han
              \p{gc=P}+                             # General_Category: Punctuation
             )/ $1 /gx;

  # Skip Punctuation and Other.
  # See also: http://www.unicode.org/reports/tr44/#General_Category_Values
  $text =~ s/[\p{gc=P}\p{gc=C}]/ /g;

  # Skip Hiragana or Katakana less than two letters.
  $text =~ s/\b[\N{U+3041}-\N{U+30FF}]{1,2}\b//g;

  # Skip Variation Selector and Specials.
  #  FE00 -  FE0F: Variation Selectors
  #  FFF0 -  FFFF: Specials
  # E0100 - E01EF: Variation Selectors Supplement
  $text =~ s/[\N{U+FE00}-\N{U+FE0F}\N{U+FFF0}-\N{U+FFFF}\N{U+E0100}-\N{U+E01FE}]//g;

  utf8::encode($text);
  return $text;
}

1;
Zerion Mini Shell 1.0