I'm trying to write a console-based web crawler in C# and want to be able to use Google's search bar to search for a keyword. I found this question, but it uses a Windows application, so I assume it works differently from a console-based application..? How can I do this in a reasonably simple way, and is it the same concept as in a Windows application?
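To make the question concrete, here is roughly what I imagine the console version boils down to (a minimal sketch of my understanding only, assuming Google's public /search?q= endpoint; WebClient is just the simplest client I know of, not what the linked question uses):

using System;
using System.Net;

class MinimalSearchFetch
{
    static void Main()
    {
        // The HTTP request itself should be identical to what a Windows
        // app would send; only the output target (console vs. a TextBox)
        // differs.
        string query = Uri.EscapeDataString("test");
        using (var client = new WebClient())
        {
            client.Headers[HttpRequestHeader.UserAgent] = "A .NET Web Crawler";
            string html = client.DownloadString("https://www.google.com/search?q=" + query);
            Console.WriteLine("Received {0} characters of HTML.", html.Length);
        }
    }
}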
Source:
using System;
using System.Collections.Specialized;
using System.IO;
using System.Net;
using System.Text.RegularExpressions;
namespace Crawler
{
//Create information handling
public interface IWidow
{
string Say(string input);
string Success(string input);
string MinorErr(string input);
string FatalErr(string input);
string Debug(string input);
}
/*
* Initiate the information handling
* and create the color coordination.
*/
public class ConsoleInformative : IWidow
{
public string Say(string input)
{
Console.ForegroundColor = ConsoleColor.Gray;
Console.WriteLine($"[{DateTime.Now.ToString("h:mm:ss tt")}] {input}");
return input;
}
public string Success(string input)
{
Console.ForegroundColor = ConsoleColor.Green;
Console.WriteLine($"[{DateTime.Now.ToString("h:mm:ss tt")}] {input}");
return input;
}
public string MinorErr(string input)
{
Console.ForegroundColor = ConsoleColor.DarkYellow;
Console.WriteLine($"[{DateTime.Now.ToString("h:mm:ss tt")}] {input}");
return input;
}
public string FatalErr(string input)
{
Console.ForegroundColor = ConsoleColor.Red;
Console.WriteLine($"[{DateTime.Now.ToString("h:mm:ss tt")}] {input}");
return input;
}
public string Debug(string input)
{
Console.ForegroundColor = ConsoleColor.Yellow;
Console.WriteLine($"[{DateTime.Now.ToString("h:mm:ss tt")}] {input}");
return input;
}
}
class BlackWidow
{
/*
* Make a request to the web host;
* in this case it is Google.
*/
private static string GetWebInfo(string url)
{
string logPath = $@"{Path.GetDirectoryName(System.AppDomain.CurrentDomain.BaseDirectory)}loghtml.txt";
string errPath = $@"{Path.GetDirectoryName(System.AppDomain.CurrentDomain.BaseDirectory)}logerror.txt";
HttpWebRequest requests = (HttpWebRequest)WebRequest.Create(url);
requests.UserAgent = "A .NET Web Crawler";
IWebProxy proxy = requests.Proxy;
IWidow info = new ConsoleInformative();
/*
* Use cached credentials to access
* the proxy if there is one.
*/
info.Say("Checking if you're behind a proxy");
if (proxy != null)
{
try
{
info.Say("Proxy found attempting to login with cached credentials..");
string proxyUri = proxy.GetProxy(requests.RequestUri).ToString();
requests.UseDefaultCredentials = true;
requests.Proxy = new WebProxy(proxyUri, false);
requests.Proxy.Credentials = System.Net.CredentialCache.DefaultCredentials;
}
/*
* Catch the exception if the cached
* credentials fail to load.
*/
catch (Exception e)
{
info.FatalErr("Unable to verify cached credentials..");
File.WriteAllText($"{errPath}", e.ToString());
info.Debug("Wrote error to file for further analysis, exiting process..");
}
}
info.Success("Logged in with cached credentials, continuing process.");
//Dispose of the response, stream, and reader when done.
using (WebResponse providedResponse = requests.GetResponse())
using (Stream stream = providedResponse.GetResponseStream())
using (StreamReader readInformation = new StreamReader(stream))
{
string htmlOutput = readInformation.ReadToEnd();
File.WriteAllText(logPath, htmlOutput);
return htmlOutput;
}
}
//Main method
static void Main(string[] args)
{
IWidow info = new ConsoleInformative();
try
{
string searchQuery = "test";
string searchEngine = "https://google.com";
NameValueCollection search = new NameValueCollection();
Regex linkParser = new Regex(@"\b(?:https?://|www\.)\S+\b", RegexOptions.Compiled | RegexOptions.IgnoreCase);
info.Say("Attempting to connect to the site..");
GetWebInfo(searchEngine);
info.Success($"Connected to site, writing HTML to file, and searching {searchEngine} with query {searchQuery}.");
search.Add("q", searchQuery);
}
/*
* Catch all exceptions and write them
* to a file for further analysis if any
* occur during the process.
*/
catch (Exception e)
{
var filePath = AppDomain.CurrentDomain.BaseDirectory;
info.FatalErr($"Exception thrown: {e}");
File.WriteAllText($@"{filePath}errorlog.LOG", e.ToString());
info.Debug($"Wrote Exception to file located in {filePath}");
}
}
}
}
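One thing I noticed while re-reading that source: the search collection is built but never actually applied to the request. My guess is that it is meant to end up in the URL, something like the following (a sketch only; QueryHelper.BuildQuery is a hypothetical helper I made up, not part of the original code):

using System;
using System.Collections.Specialized;
using System.Linq;

static class QueryHelper
{
    // Hypothetical helper: flatten a NameValueCollection into an
    // escaped query string such as "q=test".
    public static string BuildQuery(NameValueCollection values)
    {
        return string.Join("&", values.AllKeys.Select(key =>
            Uri.EscapeDataString(key) + "=" + Uri.EscapeDataString(values[key])));
    }
}

// Usage with the names from the code above:
// search.Add("q", searchQuery);
// string searchUrl = searchEngine + "/search?" + QueryHelper.BuildQuery(search);
// GetWebInfo(searchUrl);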
The code below runs in a console application, but I get a proxy error. (After the code I've added a sketch of the proxy configuration I've been trying.)
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Collections.Specialized;
using System.IO;
using System.Net;
using System.Text.RegularExpressions;
namespace Crawler
{
//Create information handling
public interface IWidow
{
string Say(string input);
string Success(string input);
string MinorErr(string input);
string FatalErr(string input);
string Debug(string input);
}
/*
* Initiate the information handling
* and create the color coordination.
*/
public class ConsoleInformative : IWidow
{
public string Say(string input)
{
Console.ForegroundColor = ConsoleColor.Gray;
Console.WriteLine("[{0}] {1}",DateTime.Now.ToString("h:mm:ss tt"),input);
return input;
}
public string Success(string input)
{
Console.ForegroundColor = ConsoleColor.Green;
Console.WriteLine("[{0}] {1}", DateTime.Now.ToString("h:mm:ss tt"), input);
return input;
}
public string MinorErr(string input)
{
Console.ForegroundColor = ConsoleColor.DarkYellow;
Console.WriteLine("[{0}] {1}", DateTime.Now.ToString("h:mm:ss tt"), input);
return input;
}
public string FatalErr(string input)
{
Console.ForegroundColor = ConsoleColor.Red;
Console.WriteLine("[{0}] {0}", DateTime.Now.ToString("h:mm:ss tt"), input);
return input;
}
public string Debug(string input)
{
Console.ForegroundColor = ConsoleColor.Yellow;
Console.WriteLine("[{0}] {1}",DateTime.Now.ToString("h:mm:ss tt"), input);
return input;
}
}
public class BlackWidow
{
public BlackWidow(string url)
{
GetWebInfo(url);
}
/*
* Make a request to the web host;
* in this case it is Google.
*/
private static string GetWebInfo(string url)
{
string logPath = Path.Combine(AppDomain.CurrentDomain.BaseDirectory, "html.txt");
string errPath = Path.Combine(AppDomain.CurrentDomain.BaseDirectory, "error.txt");
HttpWebRequest requests = (HttpWebRequest)WebRequest.Create(url);
requests.ProtocolVersion = HttpVersion.Version10;
requests.UserAgent = "A .NET Web Crawler";
IWebProxy proxy = requests.Proxy;
IWidow info = new ConsoleInformative();
/*
* Use cached credentials to access
* the proxy if there is one.
*/
info.Say("Checking if you're behind a proxy");
if (proxy != null)
{
try
{
info.Say("Proxy found attempting to login with cached credentials..");
string proxyUri = proxy.GetProxy(requests.RequestUri).ToString();
requests.UseDefaultCredentials = true;
requests.Proxy = new WebProxy(proxyUri, false);
requests.Proxy.Credentials = System.Net.CredentialCache.DefaultCredentials;
}
/*
* Catch the exception if the cached
* credentials fail to load.
*/
catch (Exception e)
{
info.FatalErr("Unable to verify cached credentials..");
File.WriteAllText(errPath, e.ToString());
info.Debug("Wrote error to file for further analysis, exiting process..");
}
}
info.Success("Logged in with cached credentials, continuing process.");
//Dispose of the response, stream, and reader when done.
using (WebResponse providedResponse = requests.GetResponse())
using (Stream stream = providedResponse.GetResponseStream())
using (StreamReader readInformation = new StreamReader(stream))
{
string htmlOutput = readInformation.ReadToEnd();
File.WriteAllText(logPath, htmlOutput);
return htmlOutput;
}
}
}
class Program
{
static void Main(string[] args)
{
IWidow info = new ConsoleInformative();
try
{
string searchQuery = "test";
string searchEngine = "https://google.com";
NameValueCollection search = new NameValueCollection();
Regex linkParser = new Regex(@"\b(?:https?://|www\.)\S+\b", RegexOptions.Compiled | RegexOptions.IgnoreCase);
info.Say("Attempting to connect to the site..");
BlackWidow blackWidow = new BlackWidow(searchEngine);
info.Success(string.Format("Connected to site, writing HTML to file, and searching {0} with query {1}.", searchEngine,searchQuery));
search.Add("q", searchQuery);
}
/*
* Catch all exceptions and write them
* to a file for further analysis if any
* occur during the process.
*/
catch (Exception e)
{
var filePath = AppDomain.CurrentDomain.BaseDirectory;
info.FatalErr(string.Format("Exception thrown: {0}", e.ToString()));
File.WriteAllText(string.Format(@"{0}errorlog.LOG",filePath), e.ToString());
info.Debug(string.Format("Wrote Exception to file located in {0}",filePath));
}
Console.ReadLine();
}
}
}
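For the proxy part, this is the direction I have been experimenting with, in case it helps frame the question (a sketch only, assuming the error comes from an authenticating system proxy; this is not the code from the linked question):

using System;
using System.Net;

class ProxyProbe
{
    static void Main()
    {
        // Sketch: route the request through the system proxy with the
        // current user's default credentials, instead of rebuilding the
        // proxy from GetProxy() as the code above does.
        HttpWebRequest request = (HttpWebRequest)WebRequest.Create("https://google.com");
        IWebProxy systemProxy = WebRequest.GetSystemWebProxy();
        systemProxy.Credentials = CredentialCache.DefaultCredentials;
        request.Proxy = systemProxy;
        using (WebResponse response = request.GetResponse())
        {
            Console.WriteLine("Status: {0}", ((HttpWebResponse)response).StatusCode);
        }
    }
}

Is this the right general approach for a console application, or am I missing something about how the proxy should be configured?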