#-------------------------------------------------------------------------------
# MyHtmlParser
#
# Subclass of HTML::Parser. To be able to use that module one has to derive
# from its class; this is done here.
#-------------------------------------------------------------------------------
package MyHtmlParser;
use HTML::Parser;
use Data::Dumper;
use strict;
use vars qw(@ISA);


@ISA = qw(HTML::Parser);


#-------------------------------------------------------------------------------
# new
#
# Constructor. Creates the parser object through the parent class and adds the
# fields in which the event handlers of this class collect their results:
#   title     - text collected while titlemode is set
#   words     - hash: lower-cased word => number of occurrences
#   links     - hash: href value of an <a> tag => number of occurrences
#   cached    - every text chunk reported by the parser, concatenated
#   titlemode - flag, set by start() and cleared by end() for the title tag
#
# The parent constructor is invoked via $class (instead of building an
# HTML::Parser object and re-blessing it), so the object belongs to the
# requested class from the start and further subclasses keep working.
#
# Returns the new object.
#-------------------------------------------------------------------------------
sub new
{
 my ($class) = @_;
 my $self = $class->SUPER::new();

 $self->{title}     = "";
 $self->{words}     = {};
 $self->{links}     = {};
 $self->{cached}    = "";
 $self->{titlemode} = 0;   # read by text() before any <title> has been seen

 return $self;
}


#-------------------------------------------------------------------------------
# addToWords
#
# Counts words in the word table of the object.
#
# $aref is a reference to an array of words. Every non-empty entry is
# converted to lower case and its counter in $self->{words} is incremented.
#-------------------------------------------------------------------------------
sub addToWords
{
 my ($self,$aref) = @_;
 my $counter = $self->{words};

 # empty strings are not words and must not get a counter of their own
 $counter->{lc($_)}++ for grep { length($_) > 0 } @$aref;
}


#-------------------------------------------------------------------------------
# stringToArray
#
# Splits a piece of text into a list of words.
#
# German umlauts and the sharp s - written either as HTML entity or as
# literal character - are transliterated to ASCII (ae, oe, ue, ss); the
# entities &nbsp; and &quot; become blanks. The text is then split at every
# run of characters other than a-z, A-Z and 0-9.
#
# $text is the text to split; an undefined value yields an empty list.
#
# Returns a reference to an array with the non-empty words in lower case, in
# the order in which they occur in the text.
#-------------------------------------------------------------------------------
sub stringToArray
{
 my ($self,$text) = @_;

 # e.g. a <meta> tag without content attribute
 return [] unless defined $text;

 # entities (the text arrives undecoded); every umlaut needs its own rule,
 # otherwise the entity name itself would be split into bogus words
 $text =~ s/&auml;/ae/g;
 $text =~ s/&ouml;/oe/g;
 $text =~ s/&uuml;/ue/g;
 $text =~ s/&Auml;/Ae/g;
 $text =~ s/&Ouml;/Oe/g;
 $text =~ s/&Uuml;/Ue/g;
 $text =~ s/&szlig;/ss/g;
 $text =~ s/&nbsp;/ /g;
 $text =~ s/&quot;/ /g;

 # literal characters
 $text =~ s/ä/ae/g;
 $text =~ s/ö/oe/g;
 $text =~ s/ü/ue/g;
 $text =~ s/Ä/Ae/g;
 $text =~ s/Ö/Oe/g;
 $text =~ s/Ü/Ue/g;
 $text =~ s/ß/ss/g;

 # a separator at the very beginning produces an empty first field
 my @words = grep { length($_) > 0 } split(/[^a-zA-Z0-9]+/,$text);

 return [ map { lc($_) } @words ];
}


#-------------------------------------------------------------------------------
# formatText
#
# Normalizes the white space of a text: leading and trailing white space is
# removed and every run of white space inside the text (blanks, tabs, line
# breaks) is replaced by a single blank.
#
# Returns the normalized text.
#-------------------------------------------------------------------------------
sub formatText
{
 my ($self,$text) = @_;

 # collapse first; afterwards at most one blank is left at either end
 $text =~ s/\s+/ /g;
 $text =~ s/^ //;
 $text =~ s/ $//;

 return $text;
}



#-------------------------------------------------------------------------------
# text
#
# Event that is called every time some text is recognized.
#
# The words of $text are added to the word table. While titlemode is set the
# text is also appended to the title. In every case the unmodified text is
# appended to $self->{cached}.
#-------------------------------------------------------------------------------
sub text
{
 my ($self,$text) = @_;

 $self->addToWords($self->stringToArray($text));

 $self->{title} .= $text if $self->{titlemode};

 $self->{cached} .= $text;
}


#-------------------------------------------------------------------------------
# start
#
# Event that is called when a start tag is recognized.
# $tag is the tag name in lower case, $attr is the attribute hash.
# $attrseq and $origtext are passed by the parser but not used here.
#
#   title - switches title mode on
#   a     - counts the value of the href attribute in $self->{links}
#   meta  - evaluates the content attribute depending on the name attribute:
#             keywords    - word list is stored in $self->{meta_keywords}
#             description - words are added to the word table, the formatted
#                           text is stored in $self->{meta_description}
#             robots      - formatted text is stored in $self->{meta_robots}
#           The name is compared without regard to case, so that e.g.
#           <meta name="Keywords" ...> is recognized as well.
#-------------------------------------------------------------------------------
sub start
{
 my ($self, $tag, $attr, $attrseq, $origtext) = @_;

 if ($tag eq "title")
    {
     $self->{titlemode} = 1;
    }
 elsif ($tag eq "a")
    {
     # anchors without href (<a name="...">) are no links
     if (exists $attr->{href})
        {
         $self->{links}->{$attr->{href}}++;
        }
    }
 elsif ($tag eq "meta")
    {
     # meta tags without name attribute (e.g. http-equiv) match none of
     # the branches below
     my $name    = defined($attr->{name}) ? lc($attr->{name}) : "";
     my $content = $attr->{content};

     if ($name eq "keywords")
        {
         $self->{meta_keywords} = $self->stringToArray($content);
        }
     elsif ($name eq "description")
        {
         $self->addToWords($self->stringToArray($content));
         $self->{meta_description} = $self->formatText($content);
        }
     elsif ($name eq "robots")
        {
         $self->{meta_robots} = $self->formatText($content);
        }
    }
}



#-------------------------------------------------------------------------------
# end
#
# Event that is called when the end of a tag is recognized.
#
# Only the end of the title tag is of interest: it switches title mode off.
# $origtext is passed by the parser but not used here.
#-------------------------------------------------------------------------------
sub end
{
 my ($self, $tag, $origtext) = @_;

 $self->{titlemode} = 0 if $tag eq "title";
}




#-------------------------------------------------------------------------------
# analyze
#
# Parses the HTML text $html and returns what the event handlers collected.
#
# After parse() the end of the document is signalled with eof(); without it
# text that the parser still holds in its buffer (e.g. text at the very end
# of the document that is not followed by a tag) would never be reported to
# text().
#
# Returns the list
#   ($title,$meta_keywords,$meta_description,$meta_robots,$words,$links,$cached)
# with
#   $title            - collected title text, formatted with formatText
#   $meta_keywords    - reference to the keyword list; a reference to an empty
#                       array if no keywords were found
#   $meta_description - $self->{meta_description}; undefined if never set
#   $meta_robots      - $self->{meta_robots}; undefined if never set
#   $words            - reference to the word table (word => count)
#   $links            - reference to the link table (href => count)
#   $cached           - the collected text of the document
#-------------------------------------------------------------------------------
sub analyze
{
 my ($self,$html) = @_;

 $self->parse($html);
 $self->eof();

 my $title = $self->formatText($self->{title});

 return ($title,
         $self->{meta_keywords} || [],
         $self->{meta_description},
         $self->{meta_robots},
         $self->{words},
         $self->{links},
         $self->{cached});
}


1;


