学院首页>网络编程>ASP.NET>提取HTML代码中文字的C#函数

提取HTML代码中文字的C#函数

作者: 来源: 添加时间:2006-5-25 20:13:45
 

/// <summary>
  /// Removes HTML markup from a string and returns the remaining text.
  /// </summary>
  /// <remarks>
  /// Processing order: script elements, HTML comments, tags, whitespace that
  /// follows a line break, a fixed set of named entities, decimal numeric
  /// character references, leftover comment delimiters, and finally any
  /// remaining angle brackets and CRLF pairs.
  /// Because angle brackets are stripped last, the result never contains
  /// them, including ones produced by decoding the lt/gt entities.
  /// Entities are decoded in sequence rather than in a single pass, so a
  /// doubly escaped entity (an escaped ampersand followed by "lt;") ends up
  /// fully decoded.
  /// </remarks>
  /// <param name="strHtml">Source text that may contain HTML. May be null.</param>
  /// <returns>
  /// The text with markup removed; an empty string when
  /// <paramref name="strHtml"/> is null or empty.
  /// </returns>
  public static string StripHTML(string strHtml)
  {
   if (string.IsNullOrEmpty(strHtml))
   {
    return string.Empty;
   }

   string strOutput = strHtml;

   // Singleline lets '.' cross line breaks, so script bodies and comments
   // that span several lines are removed as a whole.
   strOutput = Regex.Replace(strOutput, @"<script[^>]*?>.*?</script>", "",
    RegexOptions.IgnoreCase | RegexOptions.Singleline);
   strOutput = Regex.Replace(strOutput, @"<!--.*?-->", "", RegexOptions.Singleline);

   // Pattern / replacement pairs, applied in order.
   string[] aryReg = {
    // Opening, closing and self-closing tags, including quoted attribute values.
    @"<(\/\s*)?!?((\w+:)?\w+)(\w+(\s*=?\s*(([""'])(\\[""'tbnr]|[^\7])*?\7|\w+)|.{0})|\s)*?(\/\s*)?>",
    // A line break followed by further whitespace.
    @"([\r\n])[\s]+",
    @"&(quot|#34);",
    @"&(amp|#38);",
    @"&(lt|#60);",
    @"&(gt|#62);",
    @"&(nbsp|#160);",
    @"&(iexcl|#161);",
    @"&(cent|#162);",
    @"&(pound|#163);",
    @"&(copy|#169);"
   };

   string[] aryRep = {
    "",
    "",
    "\"",
    "&",
    "<",
    ">",
    " ",
    "\xa1", // inverted exclamation mark (161)
    "\xa2", // cent sign (162)
    "\xa3", // pound sign (163)
    "\xa9"  // copyright sign (169)
   };

   for (int i = 0; i < aryReg.Length; i++)
   {
    strOutput = Regex.Replace(strOutput, aryReg[i], aryRep[i], RegexOptions.IgnoreCase);
   }

   // Decode the remaining decimal character references instead of dropping them.
   strOutput = Regex.Replace(strOutput, @"&#(\d+);", new MatchEvaluator(DecodeNumericEntity));

   // Leftovers of comments that were not properly closed or opened: a stray
   // terminator becomes a line break so that the following pattern can drop
   // an unterminated comment start together with the rest of its line.
   strOutput = Regex.Replace(strOutput, @"-->", "\r\n");
   strOutput = Regex.Replace(strOutput, @"<!--.*\n", "");

   // string.Replace returns a new string; the result has to be assigned.
   strOutput = strOutput.Replace("<", "");
   strOutput = strOutput.Replace(">", "");
   strOutput = strOutput.Replace("\r\n", "");

   return strOutput;
  }

  /// <summary>
  /// Converts one decimal character reference match into the character it
  /// denotes. Returns an empty string when the number is not a valid Unicode
  /// scalar value, so such references are removed from the text.
  /// </summary>
  /// <param name="match">A match whose first group holds the decimal digits.</param>
  /// <returns>The decoded character(s), or an empty string.</returns>
  private static string DecodeNumericEntity(Match match)
  {
   int codePoint;
   bool parsed = int.TryParse(
    match.Groups[1].Value,
    System.Globalization.NumberStyles.None,
    System.Globalization.CultureInfo.InvariantCulture,
    out codePoint);

   // char.ConvertFromUtf32 throws for surrogates and values above 0x10FFFF.
   if (!parsed
    || codePoint <= 0
    || codePoint > 0x10FFFF
    || (codePoint >= 0xD800 && codePoint <= 0xDFFF))
   {
    return "";
   }

   return char.ConvertFromUtf32(codePoint);
  }


站内搜索