提高抓取器(scraper)的速度



我正在编写一个抓取代理(proxy)的小程序。它可以正常工作,唯一的缺点是耗时太长。我尝试使用 Parallel 并行处理来缩短时间,但仍然很慢。有什么办法可以加快这个过程吗?

// Downloads each proxy-list page in turn, extracts every "ip:port" entry it contains,
// prints the entries, and finally prints the total elapsed time in seconds.
Stopwatch stopwatch = new Stopwatch();
Console.Title = "Scraped proxies: 0";

// One source per entry. Keeping the URLs as separate literals avoids having to split a
// single huge string on a separator character.
string[] apiUrls =
{
    "http://proxydb.net/",
    "http://www.cybersyndrome.net/pla.html",
    "http://www.proxz.com/proxy_list_ca_0.html",
    "http://www.proxz.com/proxy_list_high_anonymous_0.html",
    "http://proxy.ipcn.org/proxylist2.html",
    "http://torvpn.com/proxylist.html",
    "http://www.proxz.com/proxy_list_anonymous_us_0.html",
    "http://www.proxz.com/proxy_list_cn_ssl_0.html",
    "http://www.proxz.com/proxy_list_jp_0.html",
    "http://www.proxz.com/proxy_list_uk_0.html",
    "http://dogdev.net/Proxy/US?port=80",
    "http://www.atomintersoft.com/products/alive-proxy/proxy-list/",
    "http://www.atomintersoft.com/anonymous_proxy_list",
    "http://www.proxz.com/proxy_list_fr_0.html",
    "http://www.atomintersoft.com/high_anonymity_elite_proxy_list",
    "http://dogdev.net/Proxy/all",
    "http://www.proxylists.net/",
    "http://www.httptunnel.ge/ProxyListForFree.aspx",
    "http://www.proxylists.net/proxylist.shtml?HTTP",
    "http://anon-proxy.ru/|html|0",
    "http://proxies.my-proxy.com/proxy-list-1.html",
    "http://globalproxies.blogspot.com/",
    "http://proxies.my-proxy.com/proxy-list-2.html",
    "http://anon-proxy.ru/",
    "http://www.socks24.org/feeds/posts/default",
    "http://www.proxylists.net/http.txt",
    "http://aa8.narod.ru/index/0-9",
    "http://www.proxylists.net/http_highanon.txt",
    "http://proxylists.net/http.txt",
    "http://free-proxy-list.net/anonymous-proxy.html",
    "http://proxylists.net/http_highanon.txt",
    "http://ab57.ru/downloads/proxylist.txt",
    "http://www.us-proxy.org/",
    "https://raw.githubusercontent.com/clarketm/proxy-list/master/proxy-list.txt",
    "http://free-socks24.blogspot.in//",
    "http://globalproxies.blogspot.com/search/label/US%20Proxies",
    "http://freepremiumproxy.blogspot.com",
    "http://aa8.narod.ru/index/0-10",
    "http://proxysearcher.sourceforge.net/Proxy%20List.php%3Ftype%3Dhttp",
    "http://rootjazz.com/proxies/proxies.txt",
    "https://chinaproxylist.wordpress.com/feed/",
    "http://sslproxies24.blogspot.nl/feeds/posts/default",
    "http://www.sslproxies24.top/feeds/posts/default",
    "http://proxy-heaven.blogspot.com/",
    "http://sslproxies24.blogspot.ca/feeds/posts/default",
    "http://aa8.narod.ru/index/0-8",
    "https://free-socks24.blogspot.in/feeds/posts/default?alt=rss",
    "http://free-socks24.blogspot.in/feeds/posts/default?alt=rss",
    "http://alexa.lr2b.com/proxylist.txt",
    "http://absentius.narod.ru/",
    "https://autoproxyblog.wordpress.com/feed/",
    "http://www.changeips.com/",
    "http://mmm-downloads.at.ua/blog",
    "http://feeds.feedburner.com/AnonymousDailyProxyList",
    "http://freeproxylistsdaily.blogspot.in/feeds/posts/default",
    "http://proxyserverlist-24.blogspot.com/feeds/posts/default",
    "http://proxy-hunter.blogspot.com/feeds/posts/default",
    "https://proxy50-50.blogspot.com/",
    "http://free-fresh-proxy-daily.blogspot.com/feeds/posts/default",
    "http://rootjazz.com/proxies/proxies.txt",
    "http://www.live-socks.net/feeds/posts/default",
    "http://www.socks24.org/feeds/posts/default",
    "http://www.proxyserverlist24.top/feeds/posts/default",
    "http://proxysearcher.sourceforge.net/Proxy%20List.php?type=http",
    "https://free-proxy-list.net/",
    "https://proxy-spider.com/api/proxies.example.txt",
    // A stray double quote that followed "type=socks" was dropped: it is not part of the URL
    // and would terminate the string literal early.
    "http://proxysearcher.sourceforge.net/Proxy%20List.php?type=socks",
    "http://proxysearcher.sourceforge.net/Proxy%20List.php",
    "https://raw.githubusercontent.com/TheSpeedX/SOCKS-List/master/socks4.txt",
    "https://raw.githubusercontent.com/ShiftyTR/Proxy-List/master/socks4.txt",
    "https://raw.githubusercontent.com/TheSpeedX/SOCKS-List/master/socks5.txt",
    "https://raw.githubusercontent.com/ShiftyTR/Proxy-List/master/http.txt",
    "https://raw.githubusercontent.com/ShiftyTR/Proxy-List/master/socks5.txt",
    "https://raw.githubusercontent.com/hookzof/socks5_list/master/proxy.txt",
    "https://raw.githubusercontent.com/TheSpeedX/SOCKS-List/master/http.txt",
    "https://proxysource.org/api/proxies/getWorkingProxies?apiToken=17580e4438910c287cef15dca10b7912a26&latencyMax=10000&latencyMin=0&outputMode=plaintext",
    "http://spys.me/proxy.txt",
    "https://api.proxyscrape.com/?request=getproxies&proxytype=all&country=all&ssl=all&anonymity=all",
    "https://raw.githubusercontent.com/clarketm/proxy-list/master/proxy-list-raw.txt",
    "https://raw.githubusercontent.com/TheSpeedX/PROXY-List/master/http.txt",
    "http://pubproxy.com/api/proxy?type=http&format=txt&limit=5",
    "http://pubproxy.com/api/proxy?type=http&format=txt&limit=5&https=true",
    "https://www.proxy-list.download/api/v1/get?type=http",
    "https://www.proxy-list.download/api/v1/get?type=https",
    "https://api.proxyscrape.com/v2/?request=displayproxies&protocol=http&timeout=10000&country=all&ssl=all&anonymity=all",
    "https://raw.githubusercontent.com/TheSpeedX/PROXY-List/master/socks4.txt",
    "http://pubproxy.com/api/proxy?type=socks4&format=txt&limit=5",
    "https://www.proxy-list.download/api/v1/get?type=socks4",
    "https://api.proxyscrape.com/v2/?request=displayproxies&protocol=socks4&timeout=10000&country=all&anonymity=all",
    "https://raw.githubusercontent.com/TheSpeedX/PROXY-List/master/socks5.txt",
    "http://pubproxy.com/api/proxy?type=socks5&format=txt&limit=5",
    "https://www.proxy-list.download/api/v1/get?type=socks5",
    "https://api.proxyscrape.com/v2/?request=displayproxies&protocol=socks5&timeout=10000&country=all&anonymity=all",
};

// Matches "a.b.c.d:port". The dots are escaped so that they match a literal '.' only;
// the pattern is built once and reused for every page.
Regex proxyPattern = new Regex(@"\d+\.\d+\.\d+\.\d+:\d+", RegexOptions.Compiled);
List<string> proxiesScraped = new List<string>();

stopwatch.Start();
using (WebClient connect = new WebClient())
{
    foreach (string apiUrl in apiUrls)
    {
        Console.WriteLine($"\r\nTHIS API IS: {apiUrl}");
        try
        {
            // Re-applied before every download.
            // NOTE(review): WebClient appears to drop restricted headers such as User-Agent
            // from its collection once a request has been sent - confirm for the target runtime.
            connect.Headers["User-Agent"] =
                "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)";
            string page = connect.DownloadString(apiUrl);

            // One regex pass per page in a plain loop: List<T> is not safe for concurrent
            // Add calls, and matching is cheap compared with the download itself.
            foreach (Match match in proxyPattern.Matches(page))
            {
                Console.WriteLine(match.Value);
                proxiesScraped.Add(match.Value);
            }

            // Updated once per source rather than once per proxy; the final count is the same.
            Console.Title = $"Scraped proxies: {proxiesScraped.Count}";
        }
        catch (Exception e)
        {
            // Best effort: a dead or malformed source must not abort the whole run,
            // but the failure is reported instead of being silently discarded.
            Console.Error.WriteLine($"Skipping {apiUrl}: {e.Message}");
        }
    }
}
stopwatch.Stop();
Console.WriteLine(stopwatch.Elapsed.TotalSeconds);
Console.ReadLine();

下面的代码在我的机器上执行大约需要 2500 ms。

/// <summary>
/// Downloads all proxy-list pages concurrently, extracts every "ip:port" entry while the
/// remaining downloads are still in flight, and prints how many entries were found together
/// with the elapsed time in milliseconds.
/// </summary>
/// <returns>A task that completes once the summary line has been written.</returns>
private static async Task Main()
{
    // One source per entry. Keeping the URLs as separate literals avoids having to split a
    // single huge string on a separator character.
    string[] urls =
    {
        "http://proxydb.net/",
        "http://www.cybersyndrome.net/pla.html",
        "http://www.proxz.com/proxy_list_ca_0.html",
        "http://www.proxz.com/proxy_list_high_anonymous_0.html",
        "http://proxy.ipcn.org/proxylist2.html",
        "http://torvpn.com/proxylist.html",
        "http://www.proxz.com/proxy_list_anonymous_us_0.html",
        "http://www.proxz.com/proxy_list_cn_ssl_0.html",
        "http://www.proxz.com/proxy_list_jp_0.html",
        "http://www.proxz.com/proxy_list_uk_0.html",
        "http://dogdev.net/Proxy/US?port=80",
        "http://www.atomintersoft.com/products/alive-proxy/proxy-list/",
        "http://www.atomintersoft.com/anonymous_proxy_list",
        "http://www.proxz.com/proxy_list_fr_0.html",
        "http://www.atomintersoft.com/high_anonymity_elite_proxy_list",
        "http://dogdev.net/Proxy/all",
        "http://www.proxylists.net/",
        "http://www.httptunnel.ge/ProxyListForFree.aspx",
        "http://www.proxylists.net/proxylist.shtml?HTTP",
        "http://anon-proxy.ru/|html|0",
        "http://proxies.my-proxy.com/proxy-list-1.html",
        "http://globalproxies.blogspot.com/",
        "http://proxies.my-proxy.com/proxy-list-2.html",
        "http://anon-proxy.ru/",
        "http://www.socks24.org/feeds/posts/default",
        "http://www.proxylists.net/http.txt",
        "http://aa8.narod.ru/index/0-9",
        "http://www.proxylists.net/http_highanon.txt",
        "http://proxylists.net/http.txt",
        "http://free-proxy-list.net/anonymous-proxy.html",
        "http://proxylists.net/http_highanon.txt",
        "http://ab57.ru/downloads/proxylist.txt",
        "http://www.us-proxy.org/",
        "https://raw.githubusercontent.com/clarketm/proxy-list/master/proxy-list.txt",
        "http://free-socks24.blogspot.in//",
        "http://globalproxies.blogspot.com/search/label/US%20Proxies",
        "http://freepremiumproxy.blogspot.com",
        "http://aa8.narod.ru/index/0-10",
        "http://proxysearcher.sourceforge.net/Proxy%20List.php%3Ftype%3Dhttp",
        "http://rootjazz.com/proxies/proxies.txt",
        "https://chinaproxylist.wordpress.com/feed/",
        "http://sslproxies24.blogspot.nl/feeds/posts/default",
        "http://www.sslproxies24.top/feeds/posts/default",
        "http://proxy-heaven.blogspot.com/",
        "http://sslproxies24.blogspot.ca/feeds/posts/default",
        "http://aa8.narod.ru/index/0-8",
        "https://free-socks24.blogspot.in/feeds/posts/default?alt=rss",
        "http://free-socks24.blogspot.in/feeds/posts/default?alt=rss",
        "http://alexa.lr2b.com/proxylist.txt",
        "http://absentius.narod.ru/",
        "https://autoproxyblog.wordpress.com/feed/",
        "http://www.changeips.com/",
        "http://mmm-downloads.at.ua/blog",
        "http://feeds.feedburner.com/AnonymousDailyProxyList",
        "http://freeproxylistsdaily.blogspot.in/feeds/posts/default",
        "http://proxyserverlist-24.blogspot.com/feeds/posts/default",
        "http://proxy-hunter.blogspot.com/feeds/posts/default",
        "https://proxy50-50.blogspot.com/",
        "http://free-fresh-proxy-daily.blogspot.com/feeds/posts/default",
        "http://rootjazz.com/proxies/proxies.txt",
        "http://www.live-socks.net/feeds/posts/default",
        "http://www.socks24.org/feeds/posts/default",
        "http://www.proxyserverlist24.top/feeds/posts/default",
        "http://proxysearcher.sourceforge.net/Proxy%20List.php?type=http",
        "https://free-proxy-list.net/",
        "https://proxy-spider.com/api/proxies.example.txt",
        // A stray double quote that followed "type=socks" was dropped: it is not part of the
        // URL and would terminate the string literal early.
        "http://proxysearcher.sourceforge.net/Proxy%20List.php?type=socks",
        "http://proxysearcher.sourceforge.net/Proxy%20List.php",
        "https://raw.githubusercontent.com/TheSpeedX/SOCKS-List/master/socks4.txt",
        "https://raw.githubusercontent.com/ShiftyTR/Proxy-List/master/socks4.txt",
        "https://raw.githubusercontent.com/TheSpeedX/SOCKS-List/master/socks5.txt",
        "https://raw.githubusercontent.com/ShiftyTR/Proxy-List/master/http.txt",
        "https://raw.githubusercontent.com/ShiftyTR/Proxy-List/master/socks5.txt",
        "https://raw.githubusercontent.com/hookzof/socks5_list/master/proxy.txt",
        "https://raw.githubusercontent.com/TheSpeedX/SOCKS-List/master/http.txt",
        "https://proxysource.org/api/proxies/getWorkingProxies?apiToken=17580e4438910c287cef15dca10b7912a26&latencyMax=10000&latencyMin=0&outputMode=plaintext",
        "http://spys.me/proxy.txt",
        "https://api.proxyscrape.com/?request=getproxies&proxytype=all&country=all&ssl=all&anonymity=all",
        "https://raw.githubusercontent.com/clarketm/proxy-list/master/proxy-list-raw.txt",
        "https://raw.githubusercontent.com/TheSpeedX/PROXY-List/master/http.txt",
        "http://pubproxy.com/api/proxy?type=http&format=txt&limit=5",
        "http://pubproxy.com/api/proxy?type=http&format=txt&limit=5&https=true",
        "https://www.proxy-list.download/api/v1/get?type=http",
        "https://www.proxy-list.download/api/v1/get?type=https",
        "https://api.proxyscrape.com/v2/?request=displayproxies&protocol=http&timeout=10000&country=all&ssl=all&anonymity=all",
        "https://raw.githubusercontent.com/TheSpeedX/PROXY-List/master/socks4.txt",
        "http://pubproxy.com/api/proxy?type=socks4&format=txt&limit=5",
        "https://www.proxy-list.download/api/v1/get?type=socks4",
        "https://api.proxyscrape.com/v2/?request=displayproxies&protocol=socks4&timeout=10000&country=all&anonymity=all",
        "https://raw.githubusercontent.com/TheSpeedX/PROXY-List/master/socks5.txt",
        "http://pubproxy.com/api/proxy?type=socks5&format=txt&limit=5",
        "https://www.proxy-list.download/api/v1/get?type=socks5",
        "https://api.proxyscrape.com/v2/?request=displayproxies&protocol=socks5&timeout=10000&country=all&anonymity=all",
    };

    // Matches "a.b.c.d:port". The dots are escaped so that they match a literal '.' only.
    // A Regex instance can be shared by the parallel consumers below.
    var proxyPattern = new Regex(@"\d+\.\d+\.\d+\.\d+:\d+", RegexOptions.Compiled);

    using (var blockingCollection = new BlockingCollection<string>())
    using (var client = new HttpClient())
    {
        var sw = Stopwatch.StartNew();

        // Producers: one download per URL, all started up front. Each pushes the page body
        // into the collection as soon as it arrives.
        Task[] producerTasks = urls.Select(url => Task.Run(async () =>
        {
            try
            {
                blockingCollection.Add(await client.GetStringAsync(url));
            }
            catch (Exception e)
            {
                // Best effort: a dead or malformed source must not abort the run, but the
                // failure is reported on stderr instead of being silently discarded.
                Console.Error.WriteLine($"Skipping {url}: {e.Message}");
            }
        })).ToArray();

        // Consumer: extracts proxies from pages as they become available. The enumeration
        // ends only after CompleteAdding has been called and the collection is drained.
        Task<List<string>> consumerTask = Task.Run(() => blockingCollection
            .GetConsumingEnumerable()
            .AsParallel()
            .WithMergeOptions(ParallelMergeOptions.NotBuffered)
            .WithDegreeOfParallelism(3)
            // Cast<Match>() keeps this compiling on runtimes where MatchCollection only
            // implements the non-generic IEnumerable.
            .SelectMany(page => proxyPattern.Matches(page).Cast<Match>().Select(m => m.Value))
            .ToList());

        // Producers handle their own exceptions, so this await does not throw and
        // CompleteAdding is always reached.
        await Task.WhenAll(producerTasks);
        blockingCollection.CompleteAdding();

        List<string> proxies = await consumerTask;
        Console.WriteLine($"Completed fetching {proxies.Count} proxies in {sw.ElapsedMilliseconds}ms");
    }
}

最新更新