Commit 0f07a5d5 authored by Markus Armbruster's avatar Markus Armbruster
Browse files

json: Nicer recovery from lexical errors



When the lexer chokes on an input character, it consumes the
character, emits a JSON error token, and enters its start state.  This
can lead to suboptimal error recovery.  For instance, input

    0123 ,

produces the tokens

    JSON_ERROR    01
    JSON_INTEGER  23
    JSON_COMMA    ,

Make the lexer skip characters after a lexical error until a
structural character ('[', ']', '{', '}', ':', ','), an ASCII control
character, or '\xFE', or '\xFF'.

Note that we must not skip ASCII control characters, '\xFE', '\xFF',
because those are documented to force the JSON parser into known-good
state, by docs/interop/qmp-spec.txt.

The lexer now produces

    JSON_ERROR    01
    JSON_COMMA    ,

Update qmp-test for the nicer error recovery: QMP now reports just one
error for input %p instead of two.  Also drop the newline after %p; it
was needed to tease out the second error.

Signed-off-by: default avatarMarkus Armbruster <armbru@redhat.com>
Reviewed-by: default avatarEric Blake <eblake@redhat.com>
Message-Id: <20180831075841.13363-5-armbru@redhat.com>
[Conflict with commit ebb4d82d resolved]
parent c0ee3afa
Loading
Loading
Loading
Loading
+29 −14
Original line number Diff line number Diff line
@@ -101,6 +101,7 @@

enum json_lexer_state {
    IN_ERROR = 0,               /* must really be 0, see json_lexer[] */
    IN_RECOVERY,
    IN_DQ_STRING_ESCAPE,
    IN_DQ_STRING,
    IN_SQ_STRING_ESCAPE,
@@ -130,6 +131,28 @@ QEMU_BUILD_BUG_ON(IN_START_INTERP != IN_START + 1);
static const uint8_t json_lexer[][256] =  {
    /* Relies on default initialization to IN_ERROR! */

    /* error recovery */
    [IN_RECOVERY] = {
        /*
         * Skip characters until a structural character, an ASCII
         * control character other than '\t', or impossible UTF-8
         * bytes '\xFE', '\xFF'.  Structural characters and line
         * endings are promising resynchronization points.  Clients
         * may use the others to force the JSON parser into known-good
         * state; see docs/interop/qmp-spec.txt.
         */
        [0 ... 0x1F] = IN_START | LOOKAHEAD,
        [0x20 ... 0xFD] = IN_RECOVERY,
        [0xFE ... 0xFF] = IN_START | LOOKAHEAD,
        ['\t'] = IN_RECOVERY,
        ['['] = IN_START | LOOKAHEAD,
        [']'] = IN_START | LOOKAHEAD,
        ['{'] = IN_START | LOOKAHEAD,
        ['}'] = IN_START | LOOKAHEAD,
        [':'] = IN_START | LOOKAHEAD,
        [','] = IN_START | LOOKAHEAD,
    },

    /* double quote string */
    [IN_DQ_STRING_ESCAPE] = {
        [0x20 ... 0xFD] = IN_DQ_STRING,
@@ -301,26 +324,18 @@ static void json_lexer_feed_char(JSONLexer *lexer, char ch, bool flush)
            /* fall through */
        case JSON_SKIP:
            g_string_truncate(lexer->token, 0);
            /* fall through */
        case IN_START:
            new_state = lexer->start_state;
            break;
        case IN_ERROR:
            /* XXX: To avoid having previous bad input leaving the parser in an
             * unresponsive state where we consume unpredictable amounts of
             * subsequent "good" input, percolate this error state up to the
             * parser by emitting a JSON_ERROR token, then reset lexer state.
             *
             * Also note that this handling is required for reliable channel
             * negotiation between QMP and the guest agent, since chr(0xFF)
             * is placed at the beginning of certain events to ensure proper
             * delivery when the channel is in an unknown state. chr(0xFF) is
             * never a valid ASCII/UTF-8 sequence, so this should reliably
             * induce an error/flush state.
             */
            json_message_process_token(lexer, lexer->token, JSON_ERROR,
                                       lexer->x, lexer->y);
            new_state = IN_RECOVERY;
            /* fall through */
        case IN_RECOVERY:
            g_string_truncate(lexer->token, 0);
            lexer->state = lexer->start_state;
            return;
            break;
        default:
            break;
        }
+1 −4
Original line number Diff line number Diff line
@@ -76,10 +76,7 @@ static void test_malformed(QTestState *qts)
    assert_recovered(qts);

    /* lexical error: interpolation */
    qtest_qmp_send_raw(qts, "%%p\n");
    /* two errors, one for "%", one for "p" */
    resp = qtest_qmp_receive(qts);
    qmp_assert_error_class(resp, "GenericError");
    qtest_qmp_send_raw(qts, "%%p");
    resp = qtest_qmp_receive(qts);
    qmp_assert_error_class(resp, "GenericError");
    assert_recovered(qts);