将HTML转换为TXT -- function HtmlToTxt(const HTMLText:string;MarkLinks:boolean):string;

                            以下是我写的一个实用函数,实现HTML到TXT格式的转换。
你可能会说,IE里提供了另存为txt类型文件的功能。不过说老实话,IE的存为txt功能做得很烂的。
以下过程经过了几年的无数的测试,稳定性相当好。输出的结果也较美观。

function HtmlToTxt(const HTMLText:string;MarkLinks:boolean):string;
const
  CR=#13#10;
var
  NextToken,s0:string;
  i:integer;
  HelpIdx:integer;
  inQuot:boolean;        // 去除<script>段之用
  InputLen:integer;
  InputIdx:integer;      // 指向输入字符的下一个待处理字符
  inPre:boolean;         // 表示是否在<pre>...</pre>段内
  CurrLink:string;

  function MakeStr(C: Char; N: Integer): string;
  begin
    if N < 1 then Result := ''
    else begin
  {$IFNDEF WIN32}
      if N > 255 then N := 255;
  {$ENDIF WIN32}
      SetLength(Result, N);
      FillChar(Result[1], Length(Result), C);
    end;
  end;

  function NPos(const C: string; S: string; N: Integer): Integer;
  var
    I, P, K: Integer;
  begin
    Result := 0;
    K := 0;
    for I := 1 to N do begin
      P := Pos(C, S);
      Inc(K, P);
      if (I = N) and (P > 0) then begin
        Result := K;
        Exit;
      end;
      if P > 0 then Delete(S, 1, P)
      else Exit;
    end;
  end;

  function ReplaceStr(const S, Srch, Replace: string): string;
  var
    I: Integer;
    Source: string;
  begin
    Source := S;
    Result := '';
    repeat
      I := Pos(Srch, Source);
      if I > 0 then begin
        Result := Result + Copy(Source, 1, I - 1) + Replace;
        Source := Copy(Source, I + Length(Srch), MaxInt);
      end
      else Result := Result + Source;
    until I <= 0;
  end;

  function UnixToDos(const s:string):string;
  begin
    result:=AdjustLineBreaks(s);
  end;

  // 取得下一段字符串
  function GetNextToken(const s:string; const StartIdx:integer):string;
  var
    i:integer;
  begin
    if StartIdx>length(s) then
    begin
      result:='';
      exit;
    end;
    result:=s[StartIdx];
    if result='&' then
    begin
      for i:=StartIdx+1 to length(s) do
      begin
        if s[i] in ['&',' ',#13,'<'] then break;
        result:=result+s[i];
        if s[i]=';' then break;
      end;
    end
    else if result='<' then
    begin
      for i:=StartIdx+1 to length(s) do
      begin
        result:=result+s[i];
        if s[i]='>' then break;
      end;
    end
    else
    begin
      for i:=StartIdx+1 to length(s) do
        if s[i] in ['&','<'] then break
        else result:=result+s[i];
    end;
  end;
 
  // 输入:<a href="http://anjo.delphibbs.com">
  // 输出:http://anjo.delphibbs.com
  function GetLink(s:string):string;
  var
    LPos,RPos,LQuot,RQuot:integer;
  begin
     result:='';

    // 去掉'....<'
    LPos:=pos('<',s);
    if LPos=0 then exit;
    delete(s,1,LPos);
    s:=Trim(s);

    // 去掉'>....'
    RPos:=pos('>',s);
    if RPos=0 then exit;
    delete(s,RPos,MaxInt);

    if uppercase(copy(s,1,2))='A ' then
    begin
      LPos:=pos('HREF',uppercase(s));
      if LPos=0 then exit;

      LQuot:=NPos('"',s,1);
      RQuot:=NPos('"',s,2);

      if (LQuot<LPos) or (RQuot>RPos) then exit;

      // 开头带'#'的超链接,视为无效
      if s[LQuot+1]='#' then exit;

      // 开头带'javascript:'的超链接,也视为无效
      // 如:<div align=right><a href="javascript:window.close()"><IMG SRC="button_close.gif"></a></div>
      if copy(s,LQuot+1,11)='javascript:' then exit;

      result:=copy(s,LQuot+1,RQuot-LQuot-1);
    end;
  end;

  // 把所有&xxx的转义;所有<xxx>取消;其它照样返回
  function ConvertHTMLToken(const s:string;var inPre:boolean):string;
  var
    s0,s0_2,s0_3,s0_4:string;
  begin
    if s='' then
    begin
      result:='';
      exit;
    end;
    if s[1]='&' then
    begin
      s0:=lowerCase(s);
      result:='';
      if s0='&nbsp;' then result:=' '
      else if s0='&quot;' then result:='"'
      else if s0='&gt;' then result:='>'
      else if s0='&lt;' then result:='<'
      else if s0='&middot;' then result:='·'
      else if s0='&trade;' then result:=' TM '
      else if s0='&copy;' then result:='(c)'
      else if s0='&reg;' then result:='(R)'
      else if s0='&amp' then result:='&';
    end
    else if s[1]='<' then
    begin
      s0:=lowerCase(s);
      s0_2:=copy(s0,1,2);
      s0_3:=copy(s0,1,3);
      s0_4:=copy(s0,1,4);
 
      result:='';
      // 将所有<hr>替换成为'------'
      if s0='<br>' then result:=CR
      else if s0_4='<pre' then   // <pre 一定要在 <p 之前判断!
           begin inPre:=true;result:=CR; end
      else if s0_2='<p' then result:=CR+CR
      else if s0_3='<hr' then result:=CR+MakeStr('-',40)+CR
      else if s0_3='<ol' then result:=CR
      else if s0_3='<ul' then result:=CR
      else if s0_3='<li' then result:='·'
      else if s0_4='</li' then result:=CR
      else if s0_4='</tr' then result:=CR
      else if s0='</td>' then result:=#9
      else if s0='<title>' then result:='《'
      else if s0='</title>' then result:='》'+CR+CR
      else if s0='</pre>' then inPre:=false
      else if copy(s0,1,6)='<table' then result:=CR
      else if MarkLinks and (s0[2]='a') then
           begin
             CurrLink:=GetLink(s);
             if CurrLink<>'' then result:='[';
           end
      else if MarkLinks and (s0='</a>') then
             if CurrLink<>'' then result:=format(' %s ]',[CurrLink]);
    end
    else if inPre then
      result:=s
    else // 不在<pre>..</pre>内,则删除所有CR
      result:=ReplaceStr(s,CR,'');
  end;

begin
  s0:=UnixToDos(HTMLText);
  result:='';
  InputLen:=length(s0);
  InputIdx:=1;
  inPre:=false;
  CurrLink:='';

  while InputIdx<=InputLen do
  begin
    NextToken:=GetNextToken(s0,InputIdx);

    // 去除<style ...> -- </style>之间的内容
    if lowercase(copy(NextToken,1,6))='<style' then
    begin
      while lowercase(NextToken)<>'</style>' do
      begin
        inc(InputIdx,length(NextToken));
        NextToken:=GetNextToken(s0,InputIdx);
      end;
      inc(InputIdx,length(NextToken));
      NextToken:=GetNextToken(s0,InputIdx);
    end;

    // 去除<Script ...> -- </Script>之间的内容
    if lowercase(copy(NextToken,1,7))='<script' then
    begin
      inc(InputIdx,length(NextToken));
      inQuot:=false;
      i:=InputIdx-1;
      while I<InputLen do
      begin
        inc(i);
        if s0[i]='"' then
        begin
          inQuot:=not inQuot;
          continue;
        end;
        if not inQuot then
          // 去除<script>段里的<!-- ... -->注释段, 99.8.2
          if copy(s0,i,4)='<!--' then
          begin
            HelpIdx:=pos('-->',copy(s0,i+4,MaxInt));
            if HelpIdx>0 then
            begin
              inc(i,4+HelpIdx+2);
            end
            else
            begin
              i:=InputLen;
              break;
            end;
          end;
          if lowercase(copy(s0,i,9))='</script>' then
          begin
            break;
          end;
      end;
      InputIdx:=i;
    end;

    NextToken:=GetNextToken(s0,InputIdx);
    inc(InputIdx,length(NextToken));
    result:=result+ConvertHTMLToken(NextToken,inPre);
  end;
end;

                                

查看回复