PrevUpHomeNext

Parsing JSON

This is a conforming JSON parser. It passes all the required tests in the JSON Test Suite, and all but 5 of the optional ones. Notice that the actual parsing bits are only about 150 lines of code.

// This header includes a type called json::value that acts as a
// Javascript-like polymorphic value type.
#include "json.hpp"

#include <boost/parser/parser.hpp>

#include <fstream>
#include <vector>
#include <climits>


namespace json {

    namespace bp = ::boost::parser;
    using namespace bp::literals;

    // The JSON spec imposes a limit on how deeply JSON data structures are
    // allowed to nest.  This exception is thrown when that limit is exceeded
    // during the parse.
    template<typename Iter>
    struct excessive_nesting : std::runtime_error
    {
        excessive_nesting(Iter it) :
            runtime_error("excessive_nesting"),
            iter(it)
        {}
        Iter iter;
    };


    // The only globals we need to parse JSON are: "How many data structures
    // deep are we?", and "What is the limit of open data structures
    // allowed?".
    struct global_state
    {
        int recursive_open_count = 0;
        int max_recursive_open_count = 0;
    };

    // When matching paired UTF-16 surrogates, we need to track a bit of state
    // between matching the first and second UTF-16 code units: namely, the
    // value of the first code unit.
    struct double_escape_locals
    {
        int first_surrogate = 0;
    };


    // Here are all the rules declared.  I've given them names that are
    // end-user friendly, so that if there is a parse error, you get a message
    // like "expected four hexadecimal digits here:", instead of "expected
    // hex_4 here:".

    bp::rule<class ws> const ws = "whitespace";

    bp::rule<class string_char, uint32_t> const string_char =
        "code point (code points <= U+001F must be escaped)";
    bp::rule<class four_hex_digits, uint32_t> const hex_4 =
        "four hexadecimal digits";
    bp::rule<class escape_seq, uint32_t> const escape_seq =
        "\\uXXXX hexadecimal escape sequence";
    bp::rule<class escape_double_seq, uint32_t, double_escape_locals> const
        escape_double_seq = "\\uXXXX hexadecimal escape sequence";
    bp::rule<class single_escaped_char, uint32_t> const single_escaped_char =
        "'\"', '\\', '/', 'b', 'f', 'n', 'r', or 't'";

    bp::rule<class null, value> const null = "null";
    bp::rule<class string, std::string> const string = "string";
    bp::rule<class number, double> const number = "number";
    bp::rule<class object_element, boost::parser::tuple<std::string, value>> const
        object_element = "object-element";
    bp::rule<class object_tag, value> const object_p = "object";
    bp::rule<class array_tag, value> const array_p = "array";

    bp::rule<class value_tag, value> const value_p = "value";



    // JSON limits whitespace to just these four characters.
    auto const ws_def = '\x09'_l | '\x0a' | '\x0d' | '\x20';

    // Since our json object representation, json::value, is polymorphic, and
    // since its default-constructed state represents the JSON value "null",
    // we need to tell a json::value that it is an object (similar to a map)
    // before we start inserting values into it.  That's why we need
    // object_init.
    auto object_init = [](auto & ctx) {
        auto & globals = _globals(ctx);
        if (globals.max_recursive_open_count < ++globals.recursive_open_count)
            throw excessive_nesting(_where(ctx).begin());
        _val(ctx) = object();
    };

    // We need object_insert because we can't just insert into the json::value
    // itself.  The json::value does not have an insert() member, because if
    // it is currently holding a number, that makes no sense.  So, for a
    // json::value x, we need to call get<object>(x) to get the object
    // interface.
    auto object_insert = [](auto & ctx) {
        value & v = _val(ctx);
        get<object>(v).insert(std::make_pair(
            std::move(_attr(ctx))[0_c], std::move(_attr(ctx)[1_c])));
    };

    // These are the array analogues of the object semantic actions above.
    auto array_init = [](auto & ctx) {
        auto & globals = _globals(ctx);
        if (globals.max_recursive_open_count < ++globals.recursive_open_count)
            throw excessive_nesting(_where(ctx).begin());
        _val(ctx) = array();
    };
    auto array_append = [](auto & ctx) {
        value & v = _val(ctx);
        get<array>(v).push_back(std::move(_attr(ctx)));
    };

    // escape_double_seq is used to match pairs of UTF-16 surrogates that form
    // a single code point.  So, after matching one UTF-16 code unit c, we
    // only want to keep going if c is a lead/high surrogate.
    auto first_hex_escape = [](auto & ctx) {
        auto & locals = _locals(ctx);
        uint32_t const cu = _attr(ctx);
        if (!boost::parser::detail::text::high_surrogate(cu))
            _pass(ctx) = false; // Not a high surrogate; explicitly fail the parse.
        else
            locals.first_surrogate = cu; // Save this initial code unit for later.
    };
    // This is also used in escape_double_seq.  When we get to this action, we
    // know we've already matched a high surrogate, and so this one had better
    // be a low surrogate, or we have a (local) parse failure.
    auto second_hex_escape = [](auto & ctx) {
        auto & locals = _locals(ctx);
        uint32_t const cu = _attr(ctx);
        if (!boost::parser::detail::text::low_surrogate(cu)) {
            _pass(ctx) = false; // Not a low surrogate; explicitly fail the parse.
        } else {
            // Success!  Write to the rule's attribute the code point that the
            // first and second code points form.
            uint32_t const high_surrogate_min = 0xd800;
            uint32_t const low_surrogate_min = 0xdc00;
            uint32_t const surrogate_offset =
                0x10000 - (high_surrogate_min << 10) - low_surrogate_min;
            uint32_t const first_cu = locals.first_surrogate;
            _val(ctx) = (first_cu << 10) + cu + surrogate_offset;
        }
    };

    // This is the verbose form of declaration for the integer and unsigned
    // integer parsers int_parser and uint_parser.  In this case, we don't
    // want to use boost::parser::hex directly, since it has a variable number
    // of digits.  We want to match exactly 4 digits, and this is how we
    // declare a hexadecimal parser that matches exactly 4.
    bp::parser_interface<bp::uint_parser<uint32_t, 16, 4, 4>> const hex_4_def;

    // We use > here instead of >>, because once we see \u, we know that
    // exactly four hex digits must follow -- no other production rule starts
    // with \u.
    auto const escape_seq_def = "\\u" > hex_4;

    // This uses the actions above and the simpler rule escape_seq to find
    // matched UTF-16 surrogate pairs.
    auto const escape_double_seq_def =
        escape_seq[first_hex_escape] >> escape_seq[second_hex_escape];

    // This symbol table recognizes each character that can appear right after
    // an escaping backslash, and, if it finds one, produces the associated
    // code point as its attribute.
    bp::symbols<uint32_t> const single_escaped_char_def = {
        {"\"", 0x0022u},
        {"\\", 0x005cu},
        {"/", 0x002fu},
        {"b", 0x0008u},
        {"f", 0x000cu},
        {"n", 0x000au},
        {"r", 0x000du},
        {"t", 0x0009u}};

    // A string may be a matched UTF-16 escaped surrogate pair, a single
    // escaped UTF-16 code unit treated as a whole code point, a single
    // escaped character like \f, or any other code point outside the range
    // [0x0000u, 0x001fu].  Note that we had to put escape_double_seq before
    // escape_seq.  Otherwise, escape_seq would eat all the escape sequences
    // before escape_double_seq could try to match them.
    auto const string_char_def = escape_double_seq | escape_seq |
                                 ('\\'_l > single_escaped_char) |
                                 (bp::cp - bp::char_(0x0000u, 0x001fu));

    // If we see the special token null, treat that as a default-constructed
    // json::value.  Note that we could have done this with a semantic action,
    // but it is best to do everything you can without semantic actions;
    // they're a lot of code.
    auto const null_def = "null" >> bp::attr(value());

    auto const string_def = bp::lexeme['"' >> *(string_char - '"') > '"'];

    // Since the JSON format for numbers is not exactly what
    // boost::parser::double_ accepts (double_ accepts too much), we need to
    // parse a JSON number as a sequence of characters, and then pass the
    // result to double_ to actually get the numeric value.  This action does
    // that.  The parser uses boost::parser::raw to produce the subrange of
    // the input that covers the number as an attribute, which is used here.
    auto parse_double = [](auto & ctx) {
        auto const cp_range = _attr(ctx);
        auto cp_first = cp_range.begin();
        auto const cp_last = cp_range.end();

        auto const result = bp::prefix_parse(cp_first, cp_last, bp::double_);
        if (result) {
            _val(ctx) = *result;
        } else {
            // This would be more efficient if we used
            // boost::container::small_vector, or std::inplace_vector from
            // C++26.
            std::vector<char> chars(cp_first, cp_last);
            auto const chars_first = &*chars.begin();
            auto chars_last = chars_first + chars.size();
            _val(ctx) = std::strtod(chars_first, &chars_last);
        }
    };

    // As indicated above, we want to match the specific formats JSON allows,
    // and then re-parse the resulting matched range within the semantic
    // action.
    auto const number_def =
        bp::raw[bp::lexeme
                    [-bp::char_('-') >>
                     (bp::char_('1', '9') >> *bp::digit | bp::char_('0')) >>
                     -(bp::char_('.') >> +bp::digit) >>
                     -(bp::char_("eE") >> -bp::char_("+-") >> +bp::digit)]]
               [parse_double];

    // Note how, in the next three parsers, we turn off backtracking by using
    // > instead of >>, once we know that there is no backtracking alternative
    // that might match if we fail to match the next element.  This produces
    // much better error messages than if you always use >>.

    auto const object_element_def = string > ':' > value_p;

    auto const object_p_def = '{'_l[object_init] >>
                              -(object_element[object_insert] % ',') > '}';

    auto const array_p_def = '['_l[array_init] >>
                             -(value_p[array_append] % ',') > ']';

    // This is the top-level parser.
    auto const value_p_def =
        number | bp::bool_ | null | string | array_p | object_p;

    // Here, we define all the rules we've declared above, which also connects
    // each rule to its _def-suffixed parser.
    BOOST_PARSER_DEFINE_RULES(
        ws,
        hex_4,
        escape_seq,
        escape_double_seq,
        single_escaped_char,
        string_char,
        null,
        string,
        number,
        object_element,
        object_p,
        array_p,
        value_p);

    // json::parse() takes a string_view as input.  It takes an optional
    // callback to use for error reporting, which defaults to a no-op that
    // ignores all errors.  It also takes an optional max recursion depth
    // limit, which defaults to the one from the JSON spec, 512.
    std::optional<value> parse(
        std::string_view str,
        diagnostic_function errors_callback = diagnostic_function(),
        int max_recursion = 512)
    {
        // Turn the input range into a UTF-32 range, so that we can be sure
        // that we fall into the Unicode-aware parsing path inside parse()
        // below.
        auto const range = boost::parser::as_utf32(str);
        using iter_t = decltype(range.begin());

        if (max_recursion <= 0)
            max_recursion = INT_MAX;

        // Initialize our globals to the current depth (0), and the max depth
        // (max_recursion).
        global_state globals{0, max_recursion};
        bp::callback_error_handler error_handler(errors_callback);
        // Make a new parser that includes the globals and error handler.
        auto const parser = bp::with_error_handler(
            bp::with_globals(value_p, globals), error_handler);

        try {
            // Parse.  If no exception is thrown, due to: a failed expectation
            // (such as foo > bar, where foo matches the input, but then bar
            // cannot); or because the nesting depth is exceeded; we simply
            // return the result of the parse.  The result will contextually
            // convert to false if the parse failed.  Note that the
            // failed-expectation exception is caught internally, and used to
            // generate an error message.
            return bp::parse(range, parser, ws);
        } catch (excessive_nesting<iter_t> const & e) {
            // If we catch an excessive_nesting exception, just report it
            // and return an empty/failure result.
            if (errors_callback) {
                std::string const message = "error: Exceeded maximum number (" +
                                            std::to_string(max_recursion) +
                                            ") of open arrays and/or objects";
                std::stringstream ss;
                bp::write_formatted_message(
                    ss, "", range.begin(), e.iter, range.end(), message);
                errors_callback(ss.str());
            }
        }

        return {};
    }

}

std::string file_slurp(std::ifstream & ifs)
{
    std::string retval;
    while (ifs) {
        char const c = ifs.get();
        retval += c;
    }
    if (!retval.empty() && retval.back() == -1)
        retval.pop_back();
    return retval;
}

int main(int argc, char * argv[])
{
    if (argc < 2) {
        std::cerr << "A filename to parse is required.\n";
        exit(1);
    }

    std::ifstream ifs(argv[1]);
    if (!ifs) {
        std::cerr << "Unable to read file '" << argv[1] << "'.\n";
        exit(1);
    }

    // Read in the entire file.
    std::string const file_contents = file_slurp(ifs);
    // Parse the contents.  If there is an error, just stream it to cerr.
    auto json = json::parse(
        file_contents, [](std::string const & msg) { std::cerr << msg; });
    if (!json) {
        std::cerr << "Parse failure.\n";
        exit(1);
    }

    std::cout << "Parse successful; contents:\n" << *json << "\n";

    return 0;
}


PrevUpHomeNext