(*----------------------------------------------------------------------------*)
(* Joachim Pimiskern, 2004. Freeware.                                         *)
(*----------------------------------------------------------------------------*)
unit martok;

interface
uses
   classes,
   jphash;


type
  TMarkupTokenType = (tt_eof,
                      tt_anytext,
                      tt_less,
                      tt_greater,
                      tt_equals,
                      tt_slash,
                      tt_string,
                      tt_identifier,
                      tt_space,
                      tt_comment
                     );

  TCharStream = class
    sl  : TStringList;
    posi: integer;
    len : integer;
    text: string;
    constructor Create(fn: string);
    destructor  Destroy; override;
    function    nextChar: integer;
  end;


  TMarkupTokenizer = class
  private
    FCharStream : TCharStream;
    FCurrentChar: integer;
    procedure    nextChar;
  public
    TokenType  : TMarkupTokenType;
    TokenString: string;
    TagContext : boolean;
    constructor  Create(CharStream: TCharStream);
    destructor   Destroy; override;
    function     TokenToString: string;
    procedure    nextToken;
  end;


  TTagTokenType = (ttt_tag,ttt_anytext,ttt_comment,ttt_eof);
  TTagToken = class
    TokenType  : TTagTokenType;
    TokenString: string;
    Data       : TJpHashTable;
    constructor  Create;
    destructor   Destroy; override;
    function     TokenToString(tt: TTagTokenType): string;
    function     GetKeyValuePairs: string;
    function     toString: string;
  end;

  TTagParser = class
  private
    FTokens: TList;
    FMarkupTokenizer: TMarkupTokenizer;
  public
    constructor  Create(mt: TMarkupTokenizer);
    destructor   Destroy; override;
    procedure    Parse;
  published
    property Tokens: TList read FTokens write FTokens;
  end;


implementation
uses
  sysutils;

constructor TCharStream.Create(fn: string);
begin
  inherited Create;
  sl := TStringList.Create;
  sl.LoadFromFile(fn);
  text := sl.Text;
  len  := length(text);
  posi  := 1;
end;


destructor TCharStream.Destroy;
begin
  sl.Free;
  inherited Destroy;
end;


function TCharStream.nextChar: integer;
begin
  if (posi <= len) then 
     begin
       result := ord(Text[posi]);
       inc(posi);
     end
  else
     result := -1;
end;


(*----------------------------------------------------------------------------*)

constructor TMarkupTokenizer.Create(CharStream: TCharStream);
begin
  inherited Create;
  TagContext   := true;
  FCharStream  := CharStream;
  nextChar;
  nextToken;
end;

destructor TMarkupTokenizer.Destroy;
begin
  inherited Destroy;
end;

function TMarkupTokenizer.TokenToString: string;
begin

  case TokenType of
    tt_eof:
      result := 'tt_eof';
    tt_anytext:
      result := 'tt_anytext';
    tt_less   :
      result := 'tt_less';
    tt_greater:
      result := 'tt_greater';
    tt_equals :
      result := 'tt_equals';
    tt_slash :
      result := 'tt_slash';
    tt_string :
      result := 'tt_string';
    tt_identifier:
      result := 'tt_identifier';
    tt_space  :
      result := 'tt_space';
    tt_comment:
      result := 'tt_comment';
  else
     result := '???';
  end;
end;


procedure TMarkupTokenizer.nextChar;
begin
  FCurrentChar := FCharStream.nextChar;
end;

function isSpace(c: char): boolean;
begin
  if (
      (c = chr( 9)) or
      (c = chr(10)) or
      (c = chr(13)) or
      (c = chr(32))
     ) then
     result := true
  else
     result := false;
end;

procedure TMarkupTokenizer.nextToken;
var state: integer;
begin
  TokenType   := tt_anytext;
  TokenString := '';

  state := 0;
  while (state <> 1000) do
  begin
    (*------------------------------------------------------------------------*)
    if (state = 0) then
    begin
      if (FCurrentChar < 0) then
         begin
           TokenType   := tt_eof;
           TokenString := 'EOF';
           state := 1000;
         end
      else
      if (chr(FCurrentChar) = '<') then
         begin
           TokenType   := tt_less;
           TokenString := '<';
           nextChar;
           state := 100;
         end
      else
      if (chr(FCurrentChar) = '>') then
         begin
           TokenType   := tt_greater;
           TokenString := '>';
           nextChar;
           state := 1000;
         end
      else
      if (chr(FCurrentChar) = '=') then
         begin
           TokenType   := tt_equals;
           TokenString := '=';
           nextChar;
           state := 1000;
         end
      else
      if (chr(FCurrentChar) = '/') then
         begin
           TokenType   := tt_slash;
           TokenString := '/';
           nextChar;
           state := 1000;
         end
      else
      if ((chr(FCurrentChar) = '"') and (TagContext = true)) then
         begin
           TokenType := tt_string;
           nextChar;
           while ((chr(FCurrentChar) <> '"') and (FCurrentChar >= 0)) do
           begin
             TokenString := TokenString + chr(FCurrentChar);
             nextChar;
           end;
           nextChar;
           state := 1000;
         end
      else
      if ((chr(FCurrentChar) = '''') and (TagContext = true)) then
         begin
           TokenType := tt_string;
           nextChar;
           while ((chr(FCurrentChar) <> '''') and (FCurrentChar >= 0)) do
           begin
             TokenString := TokenString + chr(FCurrentChar);
             nextChar;
           end;
           nextChar;
           state := 1000;
         end
      else
      if (isspace(chr(FCurrentChar))) then
         begin
           TokenType   := tt_space;
           while (isspace(chr(FCurrentChar))) do
           begin
             TokenString := TokenString + chr(FCurrentChar);
             nextChar;
           end;
           state := 1000;
         end
      else
      if (
          ((chr(FCurrentChar) >= 'a') and (chr(FCurrentChar) <= 'z')) or
          ((chr(FCurrentChar) >= 'A') and (chr(FCurrentChar) <= 'Z')) or
          (chr(FCurrentChar) = '_')
         ) then
         begin
           TokenType   := tt_identifier;
           while (
                  ((chr(FCurrentChar) >= '0') and (chr(FCurrentChar) <= '9')) or
                  ((chr(FCurrentChar) >= 'a') and (chr(FCurrentChar) <= 'z')) or
                  ((chr(FCurrentChar) >= 'A') and (chr(FCurrentChar) <= 'Z')) or
                  (chr(FCurrentChar) = '_')
                 ) do
           begin
             TokenString := TokenString + chr(FCurrentChar);
             nextChar;
           end;
           state := 1000;
         end
      else
         begin
           TokenString := TokenString + chr(FCurrentChar);
           nextChar;
           state := 0;
         end;
    end
    (*------------------------------------------------------------------------*)
    else
    if (state = 100) then
    begin
      if (chr(FCurrentChar) = '!') then
         begin
           TokenType   := tt_anytext;
           TokenString := TokenString + '!';
           nextChar;
           state := 110;
         end
      else
         State := 1000;
    end
    (*------------------------------------------------------------------------*)
    else
    if (state = 110) then
    begin
      if (chr(FCurrentChar) = '-') then
         begin
           TokenType   := tt_anytext;
           TokenString := TokenString + '-';
           nextChar;
           state := 120;
         end
      else
         State := 1000;
    end
    (*------------------------------------------------------------------------*)
    else
    if (state = 120) then
    begin
      if (chr(FCurrentChar) = '-') then
         begin
           TokenType   := tt_anytext;
           TokenString := TokenString + '-';
           nextChar;
           state := 130;
         end
      else
         State := 1000;
    end
    (*------------------------------------------------------------------------*)
    else
    if (state = 130) then
    begin
      if (isspace(chr(FCurrentChar)))  then
         begin
           TokenType   := tt_anytext;
           TokenString := TokenString + chr(FCurrentChar);
           nextChar;
           state := 140;
         end
      else
         State := 1000;
    end
    (*------------------------------------------------------------------------*)
    else
    if (state = 140) then
    begin
      if (FCurrentChar < 0) then
         state := 1000
      else
      if (chr(FCurrentChar) = '-') then
         begin
           TokenType   := tt_anytext;
           TokenString := TokenString + chr(FCurrentChar);
           nextChar;
           state := 150;
         end
      else
         begin
           TokenType   := tt_anytext;
           TokenString := TokenString + chr(FCurrentChar);
           nextChar;
           State := 140;
         end;
    end
    (*------------------------------------------------------------------------*)
    else
    if (state = 150) then
    begin
      if (FCurrentChar < 0) then
         state := 1000
      else
      if (chr(FCurrentChar) = '-') then
         begin
           TokenType   := tt_anytext;
           TokenString := TokenString + chr(FCurrentChar);
           nextChar;
           state := 160;
         end
      else
         begin
           TokenType   := tt_anytext;
           TokenString := TokenString + chr(FCurrentChar);
           nextChar;
           State := 140;
         end;
    end
    (*------------------------------------------------------------------------*)
    else
    if (state = 160) then
    begin
      if (FCurrentChar < 0) then
         state := 1000
      else
      if (chr(FCurrentChar) = '>') then
         begin
           TokenType   := tt_comment;
           TokenString := TokenString + chr(FCurrentChar);
           nextChar;
           state := 1000;
         end
      else
         begin
           TokenType   := tt_anytext;
           TokenString := TokenString + chr(FCurrentChar);
           nextChar;
           State := 140;
         end;
    end;
    (*------------------------------------------------------------------------*)
  end;
end;



(*----------------------------------------------------------------------------*)

constructor TTagToken.Create;
begin
  inherited Create;
  Data := TJpHashTable.Create;
end;

destructor TTagToken.Destroy;
begin
  Data.Free;
  inherited Destroy;
end;

function TTagToken.TokenToString(tt: TTagTokenType): string;
begin
  case tt of
    ttt_tag:
      result := 'ttt_tag';
    ttt_anytext:
      result := 'ttt_anytext';
    ttt_eof:
      result := 'ttt_eof';
    else
      result := '???';
  end;
end;

function TTagToken.GetKeyValuePairs: string;
var sl: TStringList;
    i : integer;
begin
  result := '';

  sl := TStringList.Create;
  try
    Data.Keys(sl);
    for i := 0 to sl.Count - 1 do
        begin
          result := result + sl[i] + '=' + Data.Get(sl[i]);
          if (i < sl.Count - 1) then
             result := result + ' ';
        end;
  finally
    sl.Free;
  end;
end;

function TTagToken.toString: string;
var tagname: string;
begin
  case TokenType of
    ttt_tag:
       begin
         if (Data.Exists('tagname')) then
            tagname := Data.Get('tagname')
         else
            tagname := '---tagname not found---';
         result := '<' +  tagname + ' ' + GetKeyValuePairs() + '>';
       end;
    ttt_anytext:
      result := TokenString;
    ttt_comment:
      result := 'Comment: ' + TokenString;
    ttt_eof:
      result := '???-ttt_eof';
    else
      result := '???';
  end;
end;


(*----------------------------------------------------------------------------*)

constructor TTagParser.Create(mt: TMarkupTokenizer);
begin
  inherited Create;
  FMarkupTokenizer := mt;
  FTokens := TList.Create;
  Parse;
end;

destructor TTagParser.Destroy;
var i: integer;
begin
  for i := 0 to FTokens.Count - 1 do
      TTagToken(FTokens[i]).Free;
  FTokens.Free;
  inherited Destroy;
end;

procedure TTagParser.Parse;
var state: integer;
    token: TTagToken;
    originalText: string;
    left : string;
    right: string;
begin
  while (FMarkupTokenizer.TokenType <> tt_eof) do
  begin
    if (FMarkupTokenizer.TokenType = tt_Comment) then
       begin
         token := TTagToken.Create;
         token.TokenType   := ttt_comment;
         token.TokenString := FMarkupTokenizer.TokenString;
         FTokens.Add(token);
         FMarkupTokenizer.nextToken;
       end
    else
    if (FMarkupTokenizer.TokenType = tt_Less) then
       begin
         FMarkupTokenizer.TagContext := true;
         FMarkupTokenizer.nextToken;
         token := TTagToken.Create;
         token.TokenType := ttt_tag;
         originalText := '<';
         left  := '';
         right := '';

         state := 0;
         while ((FMarkupTokenizer.TokenType <> tt_greater) and (FMarkupTokenizer.TokenType <> tt_eof)) do
         begin
           if (FMarkupTokenizer.TokenType = tt_string) then
              originalText := originalText + '"' + FMarkupTokenizer.TokenString + '"'
           else
              originalText := originalText + FMarkupTokenizer.TokenString;
           (*-----------------------------------------------------------------*)
           if (state = 0) then
              begin
                if (FMarkupTokenizer.TokenType = tt_space) then
                   begin
                     state := 0;
                   end
                else
                if (FMarkupTokenizer.TokenType = tt_identifier) then
                   begin
                     token.Data.Add('tagtype','begin');
                     token.Data.Add('tagname',lowercase(FMarkupTokenizer.TokenString));
                     state := 100;
                   end
                else
                if (FMarkupTokenizer.TokenType = tt_slash) then
                   begin
                     token.Data.Add('tagtype','end');
                     state := 200;
                   end
                else
                   begin
                     state := 1000;
                   end;
              end
           else
           (*-----------------------------------------------------------------*)
           if (state = 100) then
              begin
                if (FMarkupTokenizer.TokenType = tt_space) then
                   begin
                     state := 110;
                   end
                else
                   begin
                     state := 1000;
                   end;
              end
           else
           (*-----------------------------------------------------------------*)
           if (state = 110) then
              begin
                if (FMarkupTokenizer.TokenType = tt_space) then
                   begin
                     state := 110;
                   end
                else
                if (FMarkupTokenizer.TokenType = tt_identifier) then
                   begin
                     left := FMarkupTokenizer.TokenString;
                     state := 120;
                   end
                else
                   begin
                     state := 1000;
                   end;
              end
           else
           (*-----------------------------------------------------------------*)
           if (state = 120) then
              begin
                if (FMarkupTokenizer.TokenType = tt_space) then
                   begin
                     state := 120;
                   end
                else
                if (FMarkupTokenizer.TokenType = tt_equals) then
                   begin
                     state := 130;
                   end
                else
                   begin
                     state := 1000;
                   end
              end
           else
           (*-----------------------------------------------------------------*)
           if (state = 130) then
              begin
                if (FMarkupTokenizer.TokenType = tt_space) then
                   begin
                     state := 130;
                   end
                else
                if (FMarkupTokenizer.TokenType = tt_string) then
                   begin
                     right := FMarkupTokenizer.TokenString;
                     Token.Data.Add(lowercase(left),right);

                     left  := '';
                     right := '';
                     state := 100;
                   end
                else
                if (
                    (FMarkupTokenizer.TokenType = tt_anytext   ) or
                    (FMarkupTokenizer.TokenType = tt_slash     ) or
                    (FMarkupTokenizer.TokenType = tt_identifier)
                   ) then
                   begin
                     right := FMarkupTokenizer.TokenString;
                     state := 140;
                   end
                else
                   begin
                     state := 1000;
                   end;
              end
           else
           (*-----------------------------------------------------------------*)
           if (state = 140) then
              begin
                if (FMarkupTokenizer.TokenType = tt_space) then
                   begin
                     Token.Data.Add(lowercase(left),right);

                     left  := '';
                     right := '';
                     state := 110;
                   end
                else
                if (
                    (FMarkupTokenizer.TokenType = tt_anytext   ) or
                    (FMarkupTokenizer.TokenType = tt_slash     ) or
                    (FMarkupTokenizer.TokenType = tt_identifier)
                   ) then
                   begin
                     right := right + FMarkupTokenizer.TokenString;
                     state := 140;
                   end
                else
                   begin
                     state := 1000;
                   end;
              end
           else
           (*-----------------------------------------------------------------*)
           if (state = 200) then
              begin
                if (FMarkupTokenizer.TokenType = tt_identifier) then
                   begin
                     token.Data.Add('tagname',lowercase(FMarkupTokenizer.TokenString));
                     state := 100;
                   end
                else
                   begin
                     state := 1000;
                   end;
              end;

           FMarkupTokenizer.nextToken;
         end;

         if (left <> '') then
            begin
              Token.Data.Add(lowercase(left),right);
              left  := '';
              right := '';
            end;

         token.Data.Add('originaltext',originalText+'>');
         FTokens.Add(token);
         FMarkupTokenizer.TagContext := false;
         FMarkupTokenizer.nextToken;
       end
    else
       begin
         token := TTagToken.Create;
         token.TokenType   := ttt_anytext;
         token.TokenString := FMarkupTokenizer.TokenString;
         FTokens.Add(token);
         FMarkupTokenizer.nextToken;
       end;
  end;
end;



end.
