|
发表于 2016-2-26 16:23:12
|
显示全部楼层
缺少一个 getHtml 方法,用下面这个:
/// <summary>
/// Downloads the resource at <paramref name="url"/> and returns it decoded as text.
/// </summary>
/// <param name="url">Address of the page to fetch.</param>
/// <param name="charSet">
/// Name of the page's character encoding. When null or empty, the encoding is
/// sniffed from the page's own <c>&lt;meta ... charset=...&gt;</c> declaration.
/// </param>
/// <param name="UseUTF8CharSet">
/// When true and no charset was supplied or detected, the page is decoded as UTF-8.
/// </param>
/// <returns>
/// The decoded page text, or the literal string "error" when the download fails.
/// If the charset name is not recognised by this runtime, the text decoded with
/// <see cref="Encoding.Default"/> is returned instead of failing.
/// </returns>
/// <remarks>
/// Some pages cannot be fetched this way (they may require cookies, credentials,
/// etc.). In that case add what the site needs to the request, for example
/// <c>client.Headers.Add("Cookie", cookie)</c>, in a dedicated overload.
/// </remarks>
public string getHtml(string url, string charSet, bool UseUTF8CharSet)
{
    try
    {
        byte[] pageBytes;

        // WebClient is IDisposable; release it as soon as the bytes are downloaded.
        using (WebClient client = new WebClient())
        {
            client.Headers.Add("User-agent", "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)");

            // Credentials used to authenticate the request against the server.
            client.Credentials = CredentialCache.DefaultCredentials;

            pageBytes = client.DownloadData(url);
        }

        // First pass: decode with the platform default so the markup can be
        // searched for a charset declaration.
        string html = Encoding.Default.GetString(pageBytes);

        if (string.IsNullOrEmpty(charSet))
        {
            charSet = DetectCharSet(html);
        }

        // Tolerate a charset name that arrives wrapped in quotes.
        charSet = charSet.Replace("\"", "").Trim();

        if (charSet.Length == 0 && UseUTF8CharSet)
        {
            charSet = "utf-8";
        }

        if (charSet.Length == 0)
        {
            return html;
        }

        Encoding pageEncoding;
        try
        {
            pageEncoding = Encoding.GetEncoding(charSet);
        }
        catch (ArgumentException)
        {
            // Unknown or unsupported charset name: keep the text we already
            // have rather than throwing away a successful download.
            return html;
        }

        // Re-decode only when the declared encoding differs from the first pass.
        if (pageEncoding.Equals(Encoding.Default))
        {
            return html;
        }

        return pageEncoding.GetString(pageBytes);
    }
    catch (Exception)
    {
        // Callers rely on the "error" sentinel for any failure (bad URL,
        // network error, ...), so the catch stays deliberately broad.
        return "error";
    }
}

/// <summary>
/// Extracts the charset name from the first meta tag in <paramref name="html"/>
/// that carries a charset declaration; returns an empty string when none is found.
/// </summary>
private static string DetectCharSet(string html)
{
    // Capture only the charset token itself, so attributes that follow it
    // inside the same tag do not leak into the encoding name.
    Match declaration = Regex.Match(
        html,
        @"<meta[^>]*?charset\s*=\s*[""']?\s*([\w.:\-]+)",
        RegexOptions.IgnoreCase);

    return declaration.Success ? declaration.Groups[1].Value : string.Empty;
}
复制代码
|
|