使用 Boost Spirit X3 解析 Html

我正在尝试编写一个解析器来解析带有boost spirit x3的html，我在下面编写了解析器：

问题是这些代码无法编译。错误是：

致命错误 C1202：递归类型或函数依赖项上下文过于复杂

我知道这个错误是因为我的解析器html_element_引用tag_block_，tag_block_ 参考html_element_，但我不知道如何使其工作。

#include <boost/spirit/home/x3.hpp>
#include <boost/fusion/include/adapt_struct.hpp>
#include <boost/spirit/home/x3/support/ast/position_tagged.hpp>
#include <boost/spirit/home/x3/support/ast/variant.hpp>
#include <iostream>
using namespace boost::spirit::x3;
// Empty tag type; used further down as the ID for with<tag_name>(...) / get<tag_name>(ctx).
struct tag_name{};
// Forward declaration: html_element's variant refers to html_tag before html_tag is defined.
struct html_tag;
// Forward declaration only; no definition or use is visible in this listing.
struct html_comment;
// One attribute: a name plus an optional value.
struct attribute_data : boost::spirit::x3::position_tagged {
std::string name;
boost::optional<std::string> value;
};

// Tag name together with the tag's attributes.
struct tag_header :  boost::spirit::x3::position_tagged {
std::string name;
std::vector<attribute_data> attributes;
};
// AST node produced by the self-closing tag rule; it only carries a header.
struct self_tag: boost::spirit::x3::position_tagged {
tag_header header;
};
// A node is one of: a text string, a self_tag, or a (recursively wrapped) html_tag.
// The using-declarations expose the variant's constructors and assignment operators.
struct html_element : boost::spirit::x3::position_tagged, boost::spirit::x3::variant< std::string, self_tag, boost::recursive_wrapper<html_tag>>{
using base_type::base_type;
using base_type::operator=;
};

// A tag with a header and a list of child elements.
struct html_tag: boost::spirit::x3::position_tagged {
tag_header header;
std::vector<html_element> children;
};
// Fusion adaptation; the listed members are given in the same order as they are declared above.
BOOST_FUSION_ADAPT_STRUCT(attribute_data, name, value);
BOOST_FUSION_ADAPT_STRUCT(tag_header, name, attributes);
BOOST_FUSION_ADAPT_STRUCT(self_tag, header);
BOOST_FUSION_ADAPT_STRUCT(html_tag,header,children);
// Attribute parsers. All four rules reuse attribute_parser_id as their tag type; each rule
// is given its definition immediately at the point of declaration.
struct attribute_parser_id;
// Attribute name: one or more characters up to a space, '/', '=' or '>'.
auto attribute_identifier_= rule<attribute_parser_id, std::string>{"AttributeIdentifier"} = lexeme[+(char_ - char_(" /=>"))];
// Attribute value: double-quoted, single-quoted, or a bare word.
// The double-quote delimiters are written as the escaped literal "\"" so that the
// expression is well-formed C++.
auto attribute_value_= rule<attribute_parser_id, std::string>{"AttributeValue"} =
lexeme["\"" > +(char_ - char_("\"")) > "\""]|lexeme["'" > +(char_ - char_("'")) > "'"]|
lexeme[+(char_ - char_(" />"))];
// A name, optionally followed by '=' and a value.
auto single_attribute_ = rule<attribute_parser_id, attribute_data>{"SingleAttribute"} = attribute_identifier_ > -("=">  attribute_value_);
// Zero or more attributes.
auto attributes_ = rule<attribute_parser_id, std::vector<attribute_data>>{"Attributes"} = (*single_attribute_);

// Tag type shared by every rule declared in this section.
struct tag_parser_id;

// Action on the opening header: stores the parsed header's name in the with<tag_name>
// context value and prints the type name of the rule attribute.
auto tag_name_begin_func = [](auto &ctx){
get<tag_name>(ctx) = _attr(ctx).name;
//_val(ctx).header.name = _attr(ctx);
std::cout << typeid(_val(ctx)).name() << std::endl;
};
// Action on the closing name: the parse passes only if it equals the stored opening name.
auto tag_name_end_func = [](auto &ctx){
_pass(ctx) = get<tag_name>(ctx) == _attr(ctx);
};
// Copies the parsed tag name into the rule attribute's header.
auto self_tag_name_action = [](auto &ctx){
_val(ctx).header.name = _attr(ctx);
};
// Copies the parsed attribute list into the rule attribute's header.
auto self_tag_attribute_action = [](auto &ctx){
_val(ctx).header.attributes = _attr(ctx);
};
// Text content: one or more characters other than '<'.
auto inner_text = lexeme[+(char_-'<')];
// Tag name: zero or more characters up to a space, '/' or '>'.
auto tag_name_ = rule<tag_parser_id, std::string>{"HtmlTagName"} = lexeme[*(char_ - char_(" />"))];
// Self-closing tag. Every step after '<' uses the expectation operator '>'.
auto self_tag_ = rule<tag_parser_id, self_tag>{"HtmlSelfTag"} = '<' > tag_name_[self_tag_name_action] > attributes_[self_tag_attribute_action] > "/>";
// Opening header of a block tag; also built with expectation operators.
auto tag_header_ = rule<tag_parser_id, tag_header>{"HtmlTagBlockHeader"} = '<' > tag_name_ > attributes_ > '>';
// Rules declared first and defined below through BOOST_SPIRIT_DEFINE.
// NOTE(review): both rules use tag_parser_id as their tag; rules defined via
// BOOST_SPIRIT_DEFINE presumably need distinct tag types - confirm.
rule<tag_parser_id, html_tag> tag_block_;
rule<tag_parser_id, html_element> html_element_ = "HtmlElement";
// Block tag: header, child elements, then a closing tag whose name must match the header's.
// NOTE(review): with<tag_name> sits inside a rule that is reached recursively through
// html_element_, so each nesting level presumably produces a new context type - the
// likely trigger for the C1202 error; confirm.
auto tag_block__def = with<tag_name>(std::string())[tag_header_[tag_name_begin_func] > (*html_element_) > "</" > omit[tag_name_[tag_name_end_func]] > '>'];
// Alternatives are tried in order: text, self-closing tag, block tag.
auto html_element__def = inner_text | self_tag_ | tag_block_ ;
BOOST_SPIRIT_DEFINE(tag_block_, html_element_);
int main()
{
std::string source = "<div data-src="https://www.google.com" id='hello world'></div>";
html_element result;
auto const parser = html_element_;
auto parse_result = phrase_parse(source.begin(), source.end(), parser, ascii::space, result);
}

我尝试阅读了官方文档中的 boost::spirit::qi 示例以及 X3 的官方文档，但那个解析器只解析标签，不解析属性。X3 官方文档中的示例与我的需求不同，我认为我的情况更难处理。

在阅读时，我注意到的第一件事是 self_tag_ 使用了期望点（expectation point，即 operator>）。这行不通，因为在选择分支中，它排在其他同样可以合法地以 < 开头的规则（例如 tag_block_）之前：

auto html_element__def = inner_text | self_tag_ | tag_block_ ;

由于使用了期望点，解析器永远不会回溯去尝试后面的 tag_block_ 分支。

许多地方在需要operator*的地方使用operator+，例如：

auto inner_text = lexeme[*(char_-'<')];

所有这些字符集的差集都可以用取反字符集（operator~）来表述：

auto inner_text = lexeme[*~char_('<')];
//
= lexeme[*~char_(" />")];

除了 XML 对有效字符集（例如元素名称）有明确规定这一事实之外——我假设您是有意不去编写一个完全符合规范的解析器——具体来说，您确实需要从属性名称/值等规则中排除 "<"、">"、"\r"、"\t" 等字符。

另一个代码异味（code smell）是重复使用同一个规则标签类型。据我所知，这对于立即定义的规则应该没有问题，但对于借助 BOOST_SPIRIT_DEFINE、通过标签类型来定义的规则，肯定是不行的。

清理练习

首先进行清理。下面的代码通过在 tag_block__def 内部注释掉 *html_element_ 来绕开模板实例化深度的障碍。不过，我们先来看看哪些部分是可以工作的：

在 Coliru 上在线运行（Live On Coliru）

//#define BOOST_SPIRIT_X3_DEBUG
#include <boost/fusion/include/adapt_struct.hpp>
#include <boost/spirit/home/x3.hpp>
#include <boost/spirit/home/x3/support/ast/variant.hpp>
#include <iomanip>
#include <iostream>
//// Unused mixin disabled for simplicity
// #include <boost/spirit/home/x3/support/ast/position_tagged.hpp>
namespace x3 = boost::spirit::x3;
using namespace std::string_literals;
// AST types filled in by the parsers, followed by their Fusion adaptations.
namespace Ast {
// Empty tag type; the Parser namespace uses it as the ID for x3::with<Ast::tag_name>
// and get<Ast::tag_name>.
struct tag_name {};
// Forward declaration: element_base refers to html_tag before html_tag is defined.
struct html_tag;
// Forward declaration only; no definition or use is visible in this listing.
struct html_comment;
// using mixin = x3::position_tagged;
// Empty base class standing in for the disabled position_tagged mixin.
struct mixin {};
// One attribute: a name plus an optional value.
struct attribute_data : mixin {
std::string                  name;
boost::optional<std::string> value;
};
using attribute_datas = std::vector<attribute_data>;
// Tag name together with the tag's attributes.
struct tag_header : mixin {
std::string     name;
attribute_datas attributes;
};
// AST node for a self-closing tag; it only carries a header.
struct self_tag : mixin {
tag_header header;
};
// A node is one of: a text string, a self_tag, or a (recursively wrapped) html_tag.
using element_base =
x3::variant<std::string, self_tag, boost::recursive_wrapper<html_tag>>;
// The using-declarations expose the variant's constructors and assignment operators.
struct html_element : mixin , element_base {
using element_base::element_base;
using element_base::operator=;
};
using html_elements = std::vector<html_element>;
// A tag with a header and a list of child elements.
struct html_tag : mixin {
tag_header    header;
html_elements children;
};
} // namespace Ast
// Fusion adaptation; the listed members are given in the same order as they are declared above.
BOOST_FUSION_ADAPT_STRUCT(Ast::attribute_data, name, value)
BOOST_FUSION_ADAPT_STRUCT(Ast::tag_header, name, attributes)
BOOST_FUSION_ADAPT_STRUCT(Ast::self_tag, header)
BOOST_FUSION_ADAPT_STRUCT(Ast::html_tag, header, children)
// Grammar rules and semantic actions. Each rule here has its own tag type.
namespace Parser {
// Attribute name: one or more characters other than space, '/', '=' and '>'.
auto attribute_identifier_                                                         //
= x3::rule<struct AttributeIdentifier_tag, std::string>{"AttributeIdentifier"} //
= x3::lexeme[+~x3::char_(" /=>")];
// Attribute value: double-quoted, single-quoted, or a bare word. Quoted values may be
// empty; once an opening quote has matched, the closing quote is an expectation ('>').
auto attribute_value_                                                    //
= x3::rule<struct AttributeValue_tag, std::string>{"AttributeValue"} //
= x3::lexeme                                                             //
[('"' > *~x3::char_('"') > '"')                                      //
| ("'" > *~x3::char_("'") > "'")                                    //
| *~x3::char_(" />")                                                //
];
// A name, optionally followed by '=' and a value.
auto single_attribute_ =
x3::rule<struct attribute_identifier__tag, Ast::attribute_data>{"SingleAttribute"} //
= attribute_identifier_ >> -("=" >> attribute_value_);
// Zero or more attributes.
auto attributes_                                                              //
= x3::rule<struct attribute_data_tag, Ast::attribute_datas>{"Attributes"} //
= *single_attribute_;
// header_of: lets the semantic actions below compile for different rule-attribute types.
// This overload returns a thread-local dummy header, so writes through it are discarded.
// NOTE(review): there is no overload taking Ast::self_tag&, yet the self_tag_ actions call
// header_of(_val(ctx)); presumably those calls resolve to this unused_type overload through
// an implicit conversion and therefore write to the dummy - confirm.
[[maybe_unused]] static auto& header_of(x3::unused_type) {
thread_local Ast::tag_header s_dummy;
return s_dummy;
}
// Overload for a real html_tag attribute: returns its header.
[[maybe_unused]] static auto& header_of(Ast::html_tag& ht) {
return ht.header;
}
// Action on the opening header: stores the header's name in the with<Ast::tag_name> value.
auto tag_name_begin_func = [](auto &ctx){
get<Ast::tag_name>(ctx) = _attr(ctx).name;
// header_of(_val(ctx)).name = _attr(ctx);
// std::cout << typeid(_val(ctx)).name() << std::endl;
};
// Action on the closing name: the parse passes only if it equals the stored opening name.
auto tag_name_end_func         = [](auto& ctx){ _pass(ctx) = (get<Ast::tag_name>(ctx) == _attr(ctx)); };
// Actions used by self_tag_: assign the parsed name / attribute list through header_of.
auto self_tag_name_action      = [](auto &ctx){ header_of(_val(ctx)).name = _attr(ctx); };
auto self_tag_attribute_action = [](auto& ctx) { header_of(_val(ctx)).attributes = _attr(ctx); };
// Tag name: zero or more characters other than space, '/' and '>'.
auto tag_name_                                                     //
= x3::rule<struct HtmlTagName_tag, std::string>{"HtmlTagName"} //
= x3::lexeme[*~x3::char_(" />")];
// Self-closing tag, built with '>>' so that a failed match can backtrack.
auto self_tag_                                                       //
= x3::rule<struct HtmlSelfTag_tag, Ast::self_tag>{"HtmlSelfTag"} //
= '<' >> tag_name_[self_tag_name_action] >> attributes_[self_tag_attribute_action] >> "/>";
// Opening header of a block tag.
auto tag_header_                                                                     //
= x3::rule<struct HtmlTagBlockHeader_tag, Ast::tag_header>{"HtmlTagBlockHeader"} //
= '<' >> tag_name_ >> attributes_ >> '>';
// Rules declared here and defined below through BOOST_SPIRIT_DEFINE.
x3::rule<struct tag_block__tag, Ast::html_tag>        tag_block_    = "TagBlock";
x3::rule<struct html_element__tag, Ast::html_element> html_element_ = "HtmlElement";
// Block tag: header followed directly by the closing tag. The recursive *html_element_
// is commented out in this version, so child elements are not parsed.
auto tag_block__def = x3::with<Ast::tag_name>(""s)                        //
[                                                                     //
tag_header_[tag_name_begin_func] >> /**html_element_ >>*/ "</" >> //
x3::omit[tag_name_[tag_name_end_func]] >> '>'                     //
];
// Text content: zero or more characters other than '<'.
// NOTE(review): '*' allows an empty match and inner_text is the first alternative of
// html_element_ - confirm that html_element_ cannot succeed without consuming any input.
auto inner_text        = x3::lexeme[*~x3::char_('<')];
// Alternatives are tried in order: text, self-closing tag, block tag.
auto html_element__def = inner_text | self_tag_ | tag_block_;
BOOST_SPIRIT_DEFINE(tag_block_, html_element_)
}
namespace unit_tests {
template <bool ShouldSucceed = true, typename P>
void test(P const& rule, std::initializer_list<std::string_view> cases) {
for (auto input : cases) {
if constexpr (ShouldSucceed) {
typename x3::traits::attribute_of<P, x3::unused_type>::type result;
auto ok = phrase_parse(input.begin(), input.end(), rule, x3::space, result);
std::cout << quoted(input) << " -> " << (ok ? "Ok" : "FAILED") << std::endl;
} else {
auto ok = phrase_parse(input.begin(), input.end(), rule, x3::space);
if (!ok)
std::cout << "Fails as expected: " << quoted(input) << std::endl;
else
std::cout << "SHOULD HAVE FAILED: " << quoted(input) << std::endl;
}
}
}
}
// Drives the three groups of test inputs through unit_tests::test.
int main() {
    using case_list = std::initializer_list<std::string_view>;

    // Self-closing tags, run against Parser::self_tag_.
    case_list const self_closing = {
        R"(<simple foo="" bar='' value-less qux=bareword/>)",
        R"(<div />)",
        R"(<div/>)",
        R"(< div/>)",
    };
    unit_tests::test(Parser::self_tag_, self_closing);

    // Block tags, run against Parser::html_element_.
    case_list const blocks = {
        R"(<simple foo="" bar='' value-less qux=bareword></simple>)",
        R"(<div ></div>)",
        R"(<div></div>)",
        R"(< div></div>)",
        R"(< div ></div>)",
        R"(<div data-src="https://www.google.com" id='hello world'></div>)",
        R"(<div></ div>)",
        R"(<div></ div >)",
    };
    unit_tests::test(Parser::html_element_, blocks);

    // Inputs expected to be rejected; these are run against Parser::self_tag_.
    case_list const rejected = {
        R"(<div/ >)",
        R"(<div>< /div>)",
        R"(<div></dov>)",
    };
    unit_tests::test<false>(Parser::self_tag_, rejected);
}

输出

"<simple foo="" bar='' value-less qux=bareword/>" -> Ok   
"<div />" -> Ok
"<div/>" -> Ok
"< div/>" -> Ok
"<simple foo="" bar='' value-less qux=bareword></simple>" -> Ok
"<div ></div>" -> Ok
"<div></div>" -> Ok
"< div></div>" -> Ok
"< div ></div>" -> Ok
"<div data-src="https://www.google.com" id='hello world'></div>" -> Ok
"<div></ div>" -> Ok
"<div></ div >" -> Ok
Fails as expected: "<div/ >"
Fails as expected: "<div>< /div>"
Fails as expected: "<div></dov>"

麻烦是什么

正如你可以从我注释掉递归的 *html_element_ 这一做法中推断出来的，正是这一处递归导致了问题。

真正的原因是with<>扩展了上下文。这意味着每个递归级别都会向上下文类型添加更多数据，从而导致新的模板实例化。

最简单的技巧是将with<>移到递归之外：

// Block tag without with<>: header, child elements, then the closing tag.
auto tag_block__def =                                             //
tag_header_[tag_name_begin_func] >> *html_element_ >> "</" >> //
x3::omit[tag_name_[tag_name_end_func]] >> '>'                 //
;
auto inner_text        = x3::lexeme[*~x3::char_('<')];
auto html_element__def = inner_text | self_tag_ | tag_block_;
// Entry point: with<> wraps html_element_ once, outside the recursive rules.
auto start             = x3::with<Ast::tag_name>(""s)[html_element_];

但是，这凸显了元素可以嵌套所带来的问题：当内层标签覆盖了上下文中的 tag_name 数据时，这个方案就失效了。因此，我们可以把它改为 stack<string> 而不是 string：

auto start = x3::with<tag_stack>(std::stack<std::string>{})[html_element_];

然后修改操作以匹配：

// Header action: push the just-parsed header's name onto the tag_stack context value.
auto tag_name_begin_func = [](auto& ctx) { get<tag_stack>(ctx).push(_attr(ctx).name); };
// Closing-name action: pass only if the name equals the top of the stack. The top entry
// is popped whether or not the names match.
auto tag_name_end_func = [](auto& ctx) {
auto& s    = get<tag_stack>(ctx);
_pass(ctx) = (s.top() == _attr(ctx));
s.pop();
};

在 Coliru 上在线查看（Live On Coliru）

//#define BOOST_SPIRIT_X3_DEBUG
#include <boost/fusion/include/adapt_struct.hpp>
#include <boost/spirit/home/x3.hpp>
#include <boost/spirit/home/x3/support/ast/variant.hpp>
#include <iomanip>
#include <iostream>
#include <stack>
//// Unused mixin disabled for simplicity
// #include <boost/spirit/home/x3/support/ast/position_tagged.hpp>
namespace x3 = boost::spirit::x3;
using namespace std::string_literals;
// AST types filled in by the parsers, followed by their Fusion adaptations.
namespace Ast {
// Forward declaration: element_base refers to html_tag before html_tag is defined.
struct html_tag;
// Forward declaration only; no definition or use is visible in this listing.
struct html_comment;
// using mixin = x3::position_tagged;
// Empty base class standing in for the disabled position_tagged mixin.
struct mixin {};
// One attribute: a name plus an optional value.
struct attribute_data : mixin {
std::string                  name;
boost::optional<std::string> value;
};
using attribute_datas = std::vector<attribute_data>;
// Tag name together with the tag's attributes.
struct tag_header : mixin {
std::string     name;
attribute_datas attributes;
};
// AST node for a self-closing tag; it only carries a header.
struct self_tag : mixin {
tag_header header;
};
// A node is one of: a text string, a self_tag, or a (recursively wrapped) html_tag.
using element_base =
x3::variant<std::string, self_tag, boost::recursive_wrapper<html_tag>>;
// The using-declarations expose the variant's constructors and assignment operators.
struct html_element : mixin , element_base {
using element_base::element_base;
using element_base::operator=;
};
using html_elements = std::vector<html_element>;
// A tag with a header and a list of child elements.
struct html_tag : mixin {
tag_header    header;
html_elements children;
};
} // namespace Ast
// Fusion adaptation; the listed members are given in the same order as they are declared above.
BOOST_FUSION_ADAPT_STRUCT(Ast::attribute_data, name, value)
BOOST_FUSION_ADAPT_STRUCT(Ast::tag_header, name, attributes)
BOOST_FUSION_ADAPT_STRUCT(Ast::self_tag, header)
BOOST_FUSION_ADAPT_STRUCT(Ast::html_tag, header, children)
// Grammar rules and semantic actions; open tag names are tracked on a stack that is
// injected once by `start`.
namespace Parser {
// Empty tag type used as the ID for x3::with<tag_stack> / get<tag_stack>.
struct tag_stack final {};
// Attribute name: one or more characters other than space, '/', '=' and '>'.
auto attribute_identifier_                                                         //
= x3::rule<struct AttributeIdentifier_tag, std::string>{"AttributeIdentifier"} //
= x3::lexeme[+~x3::char_(" /=>")];
// Attribute value: double-quoted, single-quoted, or a bare word. Quoted values may be
// empty; once an opening quote has matched, the closing quote is an expectation ('>').
auto attribute_value_                                                    //
= x3::rule<struct AttributeValue_tag, std::string>{"AttributeValue"} //
= x3::lexeme                                                             //
[('"' > *~x3::char_('"') > '"')                                      //
| ("'" > *~x3::char_("'") > "'")                                    //
| *~x3::char_(" />")                                                //
];
// A name, optionally followed by '=' and a value.
auto single_attribute_ =
x3::rule<struct attribute_identifier__tag, Ast::attribute_data>{"SingleAttribute"} //
= attribute_identifier_ >> -("=" >> attribute_value_);
// Zero or more attributes.
auto attributes_                                                              //
= x3::rule<struct attribute_data_tag, Ast::attribute_datas>{"Attributes"} //
= *single_attribute_;
// header_of: lets the semantic actions below compile for different rule-attribute types.
// This overload returns a thread-local dummy header, so writes through it are discarded.
// NOTE(review): there is no overload taking Ast::self_tag&, yet the self_tag_ actions call
// header_of(_val(ctx)); presumably those calls resolve to this unused_type overload through
// an implicit conversion and therefore write to the dummy - confirm.
[[maybe_unused]] static auto& header_of(x3::unused_type) {
thread_local Ast::tag_header s_dummy;
return s_dummy;
}
// Overload for a real html_tag attribute: returns its header.
[[maybe_unused]] static auto& header_of(Ast::html_tag& ht) {
return ht.header;
}
// Header action: push the just-parsed header's name onto the tag_stack context value.
// NOTE(review): nothing pops this entry if the enclosing tag_block_ later fails before
// the closing-name action runs; confirm stale entries cannot affect later comparisons.
auto tag_name_begin_func = [](auto& ctx) { get<tag_stack>(ctx).push(_attr(ctx).name); };
// Closing-name action: pass only if the name equals the top of the stack. The top entry
// is popped whether or not the names match.
auto tag_name_end_func = [](auto& ctx) {
auto& s    = get<tag_stack>(ctx);
_pass(ctx) = (s.top() == _attr(ctx));
s.pop();
};
// Actions used by self_tag_: assign the parsed name / attribute list through header_of.
auto assign_name  = [](auto& ctx) { header_of(_val(ctx)).name = _attr(ctx); };
auto assign_attrs = [](auto& ctx) { header_of(_val(ctx)).attributes = _attr(ctx); };
// Tag name: zero or more characters other than space, '/' and '>'.
auto tag_name_                                                     //
= x3::rule<struct HtmlTagName_tag, std::string>{"HtmlTagName"} //
= x3::lexeme[*~x3::char_(" />")];
// Self-closing tag, built with '>>' so that a failed match can backtrack.
auto self_tag_                                                       //
= x3::rule<struct HtmlSelfTag_tag, Ast::self_tag>{"HtmlSelfTag"} //
= '<' >> tag_name_[assign_name] >> attributes_[assign_attrs] >> "/>";
// Opening header of a block tag.
auto tag_header_                                                                     //
= x3::rule<struct HtmlTagBlockHeader_tag, Ast::tag_header>{"HtmlTagBlockHeader"} //
= '<' >> tag_name_ >> attributes_ >> '>';
// Rules declared here and defined below through BOOST_SPIRIT_DEFINE.
x3::rule<struct tag_block__tag, Ast::html_tag>        tag_block_    = "TagBlock";
x3::rule<struct html_element__tag, Ast::html_element> html_element_ = "HtmlElement";
// Block tag: header, zero or more child elements, then a closing tag whose name must
// match the name pushed by the header action.
auto tag_block__def =                                             //
tag_header_[tag_name_begin_func] >> *html_element_ >> "</" >> //
x3::omit[tag_name_[tag_name_end_func]] >> '>'                 //
;
// Text content: zero or more characters other than '<'.
// NOTE(review): '*' allows an empty match and inner_text is the first alternative of
// html_element_, which is itself repeated by *html_element_ above - confirm that
// html_element_ cannot succeed without consuming input and that the repetition terminates.
auto inner_text        = x3::lexeme[*~x3::char_('<')];
// Alternatives are tried in order: text, self-closing tag, block tag.
auto html_element__def = inner_text | self_tag_ | tag_block_;
// Entry point: injects an empty stack of tag names once, outside the recursive rules.
auto start             = x3::with<tag_stack>(std::stack<std::string>{})[html_element_];
BOOST_SPIRIT_DEFINE(tag_block_, html_element_)
}
namespace unit_tests {
template <bool ShouldSucceed = true, typename P>
void test(P const& rule, std::initializer_list<std::string_view> cases) {
for (auto input : cases) {
if constexpr (ShouldSucceed) {
typename x3::traits::attribute_of<P, x3::unused_type>::type result;
auto ok = phrase_parse(input.begin(), input.end(), rule, x3::space, result);
std::cout << quoted(input) << " -> " << (ok ? "Ok" : "FAILED") << std::endl;
} else {
auto ok = phrase_parse(input.begin(), input.end(), rule, x3::space);
if (!ok)
std::cout << "Fails as expected: " << quoted(input) << std::endl;
else
std::cout << "SHOULD HAVE FAILED: " << quoted(input) << std::endl;
}
}
}
}
// Drives the three groups of test inputs through unit_tests::test.
int main() {
    using case_list = std::initializer_list<std::string_view>;

    // Self-closing tags, run against Parser::self_tag_.
    case_list const self_closing = {
        R"(<simple foo="" bar='' value-less qux=bareword/>)",
        R"(<div />)",
        R"(<div/>)",
        R"(< div/>)",
    };
    unit_tests::test(Parser::self_tag_, self_closing);

    // Block tags, including one nested document, run against Parser::start.
    case_list const blocks = {
        R"(<simple foo="" bar='' value-less qux=bareword></simple>)",
        R"(<div ></div>)",
        R"(<div></div>)",
        R"(< div></div>)",
        R"(< div ></div>)",
        R"(<div data-src="https://www.google.com" id='hello world'></div>)",
        R"(<div></ div>)",
        R"(<div></ div >)",
        R"(<div><nest/><nest some="more">yay</nest></div>)",
    };
    unit_tests::test(Parser::start, blocks);

    // Inputs expected to be rejected; these are run against Parser::self_tag_.
    case_list const rejected = {
        R"(<div/ >)",
        R"(<div>< /div>)",
        R"(<div></dov>)",
    };
    unit_tests::test<false>(Parser::self_tag_, rejected);
}

输出

"<simple foo="" bar='' value-less qux=bareword/>" -> Ok
"<div />" -> Ok
"<div/>" -> Ok
"< div/>" -> Ok
"<simple foo="" bar='' value-less qux=bareword></simple>" -> Ok
"<div ></div>" -> Ok
"<div></div>" -> Ok
"< div></div>" -> Ok
"< div ></div>" -> Ok
"<div data-src="https://www.google.com" id='hello world'></div>" -> Ok
"<div></ div>" -> Ok
"<div></ div >" -> Ok
"<div><nest/><nest some="more">yay</nest></div>" -> Ok
Fails as expected: "<div/ >"
Fails as expected: "<div>< /div>"
Fails as expected: "<div></dov>"

结语

我回答这个问题，假设你只是为了学习 X3。否则，唯一的建议是：不要这样做。使用库。

你的语法不仅在解析 XML 方面做得很差，而且对现实中的 HTML 也会完全失败。在 HTML 中，结束标签并不一定存在（“quirks 模式”）。脚本、CDATA、实体引用、Unicode、转义都会让你的解析器出错。

哦，你有没有注意到你是如何通过引入一些语义操作来破坏属性传播的？我可以告诉你如何修复它，但我想我宁愿暂时离开它。

只需使用库即可。

这个最初的解决方案除了其他问题之外，还解决了开始/结束标签的匹配问题；这里对它做了大幅简化。这个简化版本只关注问题中“匹配开始/结束标签”这一子部分。简化版本不再尝试解析字符串，而只是解析 x3::uint_。这足以说明该子问题的解决方案，因为该子问题的本质就是把开始标签与结束标签匹配起来。更具体地说，推断下面这个表达式的属性：

auto 
tag_header_
= 
(  '<' 
>> tag_name_
>> '>'
)
#ifdef USE_SEMANTIC_ACTIONS
[tag_name_begin_func]
#endif
;

与此表达式的属性相同：

auto 
tag_footer_
= 
(  "</"
>> tag_name_ 
>> '>'
)
#ifdef USE_SEMANTIC_ACTIONS
[tag_name_end_func]
#endif
;

在视觉上，这比推断下面这个表达式的属性要简单得多：

auto tag_name_                                                     //
= x3::rule<struct HtmlTagName_tag, std::string>{"HtmlTagName"} //
= x3::lexeme[*~x3::char_(" />")];

与此表达式的属性相同：

"</" >> //
x3::omit[tag_name_[tag_name_end_func]] >> '>'                 //

后两个视觉上更复杂的表达式是从这里复制粘贴过来的。

此外，tag_name_ 和 inner_text 也简单得多。原来的写法：

auto tag_name_                                                     //
= x3::rule<struct HtmlTagName_tag, std::string>{"HtmlTagName"} //
= x3::lexeme[*~x3::char_(" />")];
auto inner_text        = x3::lexeme[*~x3::char_('<')];

显然比简化后的写法更复杂：

auto tag_name_
= x3::uint_;
auto inner_text        = x3::uint_;

现在，读者可能会注意到，原始解决方案包含若干 sehe 称之为“立即定义的规则”的声明。“立即定义的规则”这一模式可以抽象为：

auto RuleDef
= x3::rule<struct RuleTag, RuleAttribute>{"RuleName"}
= RuleRhs;

在这个抽象中，采用驼峰命名（CamelCase）的标识符是模式参数，通过替换它们即可创建一条立即定义的规则，有点像模板被实例化的方式。在上面的 tag_name_ 实例中，进行了如下替换：

RuleDef -> tag_name_
RuleTag -> HtmlTagName_tag
RuleAttribute -> std::string
RuleName -> HtmlTagName
RuleRhs -> x3::lexeme[*~x3::char_(" />")]

但是，立即定义的规则的目的是什么？一个原因是把 RuleRhs 的属性转换为规则的属性，如这里所示。（该示例可能有点难以理解，因为其中立即定义的规则位于作为解析函数实参的解析器表达式之中，因而不够显眼。）

但是，在简化版本中并不需要这种属性转换；因此，所有立即定义的规则都被删除，作为进一步的简化。

清理练习

麻烦是什么

结语

相关内容

最新更新

热门标签：