如何在 Python 中获取开始/结束 HTML 标记的位置



如何在 Python3 上解决这个问题,使用什么库,使用什么示例代码?

我有一个 HTML 文件,并且已知其中某个位置(行:列),该位置落在某个 HTML 标签(例如 <table ......>)的内部;如何获取这个 <table> 开始标签两端尖括号(< 和 >)的位置,以及与之对应的 </table> 结束标签两端尖括号的位置?

(注意:表格标签可能嵌套,即一个 <table> 可能位于另一个 <table> 之内。)

就像这个 SO 答案中所说的那样,您不应该使用正则表达式来解析 HTML 文件,因为标准是非常不规则的。您应该改用 HTML 解析库,例如 html.parser :此库为您提供HTMLParser.getpos(),它返回标记的行号和偏移量。

下面的代码使用 html.parser 获取每个标签的起止坐标:我对 goahead 函数做了猴子补丁(monkey patch),只做了一处简单修改——在解析完标签之后调用自定义方法 get_endpos,以记录该标签的结束位置。

import re
from html import unescape
from html.parser import HTMLParser, charref, entityref, incomplete, starttagopen
class MyHTMLParser(HTMLParser):
    """HTMLParser subclass that records the position of start and end tags.

    Each handled tag is stored as ``(tag, (line, offset))`` where the
    position is what ``getpos()`` reports when the handler runs.  When
    ``get_endpos()`` is called afterwards, a second ``(line, offset)`` pair
    (the position reported by ``getpos()`` at that moment) is appended to
    the record, giving ``(tag, start, end)``.

    Attributes:
        start_tags: records produced by ``handle_starttag``.
        end_tags: records produced by ``handle_endtag``.
        last_append: the list (``start_tags`` or ``end_tags``) that received
            the most recent record; kept for backward compatibility.
    """

    def __init__(self):
        HTMLParser.__init__(self)
        self.start_tags = []
        self.end_tags = []
        self.last_append = []
        # Flags telling get_endpos() which list has a fresh record that is
        # still missing its end position.
        self._start_pending = False
        self._end_pending = False

    def handle_starttag(self, tag, attrs):
        """Record ``tag`` together with the current parser position."""
        self.start_tags.append((tag, self.getpos()))
        self.last_append = self.start_tags
        self._start_pending = True

    def handle_endtag(self, tag):
        """Record the closing ``tag`` together with the current position."""
        self.end_tags.append((tag, self.getpos()))
        self.last_append = self.end_tags
        self._end_pending = True

    def get_endpos(self):
        """Attach the current position as end position to pending records.

        Only records created since the previous call are touched, so calling
        this when no tag was handled (for example after a comment or a
        doctype, or before any tag at all) is a harmless no-op instead of
        raising ``IndexError`` or appending a spurious extra position.
        A self-closing tag triggers both handlers, so both of its records
        receive the end position.
        """
        end = self.getpos()
        if self._start_pending and self.start_tags:
            self.start_tags[-1] = self.start_tags[-1] + (end,)
        if self._end_pending and self.end_tags:
            self.end_tags[-1] = self.end_tags[-1] + (end,)
        self._start_pending = False
        self._end_pending = False

    def get_tags(self):
        """Return the ``(start_tags, end_tags)`` pair of record lists."""
        return self.start_tags, self.end_tags

    def _reset(self):
        """Reset the underlying parser and discard all collected records."""
        HTMLParser.reset(self)
        self.start_tags = []
        self.end_tags = []
        # Also drop the reference to the old list so a later get_endpos()
        # cannot modify records that were already discarded.
        self.last_append = []
        self._start_pending = False
        self._end_pending = False
# Parser instance whose start_tags / end_tags lists collect the tag positions.
parser = MyHTMLParser()
# Internal -- handle data as far as reasonable.  May leave state
# and data to be processed by a subsequent call.  If 'end' is
# true, force handling all data as if followed by EOF marker.
def goahead(self, end):
    """Parse as much of ``self.rawdata`` as possible, reporting tag ends.

    Meant to be installed as the ``goahead`` method of an ``HTMLParser``
    subclass that provides ``get_endpos()``.  It walks ``self.rawdata``,
    dispatching text, tags, comments, declarations and character/entity
    references to the parser's handlers, and stores the unprocessed
    remainder back into ``self.rawdata``.

    The one addition on top of the dispatch logic is that
    ``self.get_endpos()`` is called right after a start or end tag has been
    parsed and the position has been advanced past it.  It is not called
    for comments, declarations, processing instructions, a lone ``<`` or a
    tag that failed to parse, because no tag record exists in those cases.

    Args:
        end: if true, treat the buffered data as complete and flush
            everything, as if it were followed by an EOF marker.
    """
    rawdata = self.rawdata
    i = 0
    n = len(rawdata)
    while i < n:
        if self.convert_charrefs and not self.cdata_elem:
            j = rawdata.find('<', i)
            if j < 0:
                # if we can't find the next <, either we are at the end
                # or there's more text incoming.  If the latter is True,
                # we can't pass the text to handle_data in case we have
                # a charref cut in half at end.  Try to determine if
                # this is the case before proceeding by looking for an
                # & near the end and see if it's followed by a space or ;.
                amppos = rawdata.rfind('&', max(i, n-34))
                if (amppos >= 0 and
                    not re.compile(r'[\s;]').search(rawdata, amppos)):
                    break  # wait till we get all the text
                j = n
        else:
            match = self.interesting.search(rawdata, i)  # < or &
            if match:
                j = match.start()
            else:
                if self.cdata_elem:
                    break
                j = n
        if i < j:
            if self.convert_charrefs and not self.cdata_elem:
                self.handle_data(unescape(rawdata[i:j]))
            else:
                self.handle_data(rawdata[i:j])
        i = self.updatepos(i, j)
        if i == n: break
        startswith = rawdata.startswith
        if startswith('<', i):
            # True only when a start/end tag was parsed successfully, i.e.
            # when there may be a tag record waiting for its end position.
            tag_parsed = False
            if starttagopen.match(rawdata, i): # < + letter
                k = self.parse_starttag(i)
                tag_parsed = k >= 0
            elif startswith("</", i):
                k = self.parse_endtag(i)
                tag_parsed = k >= 0
            elif startswith("<!--", i):
                k = self.parse_comment(i)
            elif startswith("<?", i):
                k = self.parse_pi(i)
            elif startswith("<!", i):
                k = self.parse_html_declaration(i)
            elif (i + 1) < n:
                self.handle_data("<")
                k = i + 1
            else:
                break
            if k < 0:
                if not end:
                    break
                # Incomplete construct at EOF: emit it as plain text up to
                # the next '>' (inclusive), else the next '<', else 1 char.
                k = rawdata.find('>', i + 1)
                if k < 0:
                    k = rawdata.find('<', i + 1)
                    if k < 0:
                        k = i + 1
                else:
                    k += 1
                if self.convert_charrefs and not self.cdata_elem:
                    self.handle_data(unescape(rawdata[i:k]))
                else:
                    self.handle_data(rawdata[i:k])
            i = self.updatepos(i, k)
            if tag_parsed:
                # the position now points just past the tag: record it
                self.get_endpos()
        elif startswith("&#", i):
            match = charref.match(rawdata, i)
            if match:
                name = match.group()[2:-1]
                self.handle_charref(name)
                k = match.end()
                if not startswith(';', k-1):
                    k = k - 1
                i = self.updatepos(i, k)
                continue
            else:
                if ";" in rawdata[i:]:  # bail by consuming &#
                    self.handle_data(rawdata[i:i+2])
                    i = self.updatepos(i, i+2)
                break
        elif startswith('&', i):
            match = entityref.match(rawdata, i)
            if match:
                name = match.group(1)
                self.handle_entityref(name)
                k = match.end()
                if not startswith(';', k-1):
                    k = k - 1
                i = self.updatepos(i, k)
                continue
            match = incomplete.match(rawdata, i)
            if match:
                # match.group() will contain at least 2 chars
                if end and match.group() == rawdata[i:]:
                    k = match.end()
                    if k <= i:
                        k = n
                    i = self.updatepos(i, i + 1)
                # incomplete
                break
            elif (i + 1) < n:
                # not the end of the buffer, and can't be confused
                # with some other construct
                self.handle_data("&")
                i = self.updatepos(i, i + 1)
            else:
                break
        else:
            assert 0, "interesting.search() lied"
    # end while
    if end and i < n and not self.cdata_elem:
        if self.convert_charrefs and not self.cdata_elem:
            self.handle_data(unescape(rawdata[i:n]))
        else:
            self.handle_data(rawdata[i:n])
        i = self.updatepos(i, n)
    self.rawdata = rawdata[i:]
# Install the patched goahead on the subclass only; HTMLParser itself is not modified.
MyHTMLParser.goahead = goahead
# NOTE(review): your_html_file_as_a_string is a placeholder that is not defined
# in this snippet; it must be bound to the HTML text before this line runs.
parser.feed(your_html_file_as_a_string)
# Print the (start_tags, end_tags) pair returned by get_tags().
print(parser.get_tags())

最新更新