|
|
发表于 2016-2-26 16:17:11
|
显示全部楼层
#region Crawl all links on a page
// Matches <a ... href=...>...</a> elements. Group "url" captures the raw href value and
// group "text" captures the markup between the opening and closing tags.
// Built once and reused: a Regex instance is immutable and safe to share across calls.
private static readonly Regex AnchorHrefRegex = new Regex(@"(?is)<a[^>]*?href=(['""]?)(?<url>[^'""\s>]+)\1[^>]*>(?<text>(?:(?!</?a\b).)*)</a>");

/// <summary>
/// Downloads the page at <paramref name="url"/> and returns the target of every anchor
/// element found in it, with relative targets resolved against <paramref name="url"/>.
/// </summary>
/// <param name="url">Absolute address of the page to scan; also used as the base for relative links.</param>
/// <returns>
/// The collected links in document order (duplicates are kept). The list is empty when the
/// page could not be fetched or <paramref name="url"/> is not a valid absolute URI.
/// This method is best-effort and does not throw.
/// </returns>
public static List<string> GetAllHref(string url)
{
    List<string> allHref = new List<string>();
    try
    {
        // NOTE(review): soso.getHtml is defined elsewhere; the literal "error" is treated
        // here as its failure marker, exactly as the original code did.
        string strhtml = soso.getHtml(url, "", true);
        if (strhtml == null || strhtml == "error")
        {
            return allHref;
        }

        // The base URI does not change between links, so build it once instead of per match.
        Uri baseUri;
        if (!Uri.TryCreate(url, UriKind.Absolute, out baseUri))
        {
            return allHref;
        }

        foreach (Match m in AnchorHrefRegex.Matches(strhtml))
        {
            string href = m.Groups["url"].Value;

            // Skip a single malformed href instead of aborting the whole scan.
            Uri resolved;
            if (!Uri.TryCreate(baseUri, href, out resolved))
            {
                continue;
            }

            // Links that already start with "http" are kept exactly as written in the page;
            // everything else is reported in its resolved absolute form.
            string fullUrl = href.StartsWith("http", StringComparison.Ordinal)
                ? href
                : resolved.ToString();
            allHref.Add(fullUrl);
        }
    }
    catch (Exception ex)
    {
        // Stay best-effort for callers, but leave a trace instead of hiding the failure.
        System.Diagnostics.Trace.TraceWarning("GetAllHref failed for {0}: {1}", url, ex.Message);
    }
    return allHref;
}
#endregion
复制代码
#region Deduplicate data
/// <summary>
/// Removes duplicate strings from a list, keeping the first occurrence of each value
/// in its original position.
/// </summary>
/// <param name="list">The strings to deduplicate. A null list is treated as empty.</param>
/// <returns>
/// A new list containing each distinct value once, in first-seen order.
/// The input list is not modified.
/// </returns>
public static List<string> getUnqueList(List<string> list)
{
    List<string> unique = new List<string>();
    if (list == null)
    {
        return unique;
    }

    // HashSet<string>.Add returns false when the value was already present, so a single
    // call both tests and records membership. Unlike Hashtable keys, it also accepts null,
    // so a null entry in the input no longer throws.
    HashSet<string> seen = new HashSet<string>();
    foreach (string s in list)
    {
        if (seen.Add(s))
        {
            unique.Add(s);
        }
    }
    return unique;
}
#endregion
复制代码
|
|