Browse Source

Support most of the latest Turtle Editor's Draft.

git-svn-id: http://svn.drobilla.net/serd/trunk@418 490d8e77-9747-427b-9fa3-0b8f29cee8a0
zrythm_meson
David Robillard 10 years ago
parent
commit
4cd874e79f
  1. 3
      NEWS
  2. 12
      serd.ttl
  3. 631
      src/reader.c
  4. 210
      src/writer.c
  5. 10
      tests/good/test-07.nt
  6. 1
      tests/good/test-29.nt
  7. 1
      tests/good/test-29.ttl
  8. 4
      tests/good/test-backspace.nt
  9. 6
      tests/good/test-blank-in-list.nt
  10. 2
      tests/good/test-escapes.ttl
  11. 10
      tests/good/test-list-in-blank.nt
  12. 10
      tests/good/test-list-subject.nt
  13. 64
      tests/good/test-pretty.nt
  14. 1
      tests/good/test-uri-escape.nt
  15. 1
      tests/good/test-uri-escape.ttl
  16. 1
      tests/new/HYPHEN_MINUS_in_local_name.nt
  17. 2
      tests/new/HYPHEN_MINUS_in_local_name.ttl
  18. 1
      tests/new/IRIREF_datatype.nt
  19. 1
      tests/new/IRIREF_datatype.ttl
  20. 1
      tests/new/IRI_subject.nt
  21. 1
      tests/new/IRI_subject.ttl
  22. 1
      tests/new/IRI_with_all_punctuation.nt
  23. 1
      tests/new/IRI_with_all_punctuation.ttl
  24. 1
      tests/new/IRI_with_eight_digit_numeric_escape.ttl
  25. 1
      tests/new/IRI_with_four_digit_numeric_escape.ttl
  26. 1
      tests/new/LITERAL1.nt
  27. 1
      tests/new/LITERAL1.ttl
  28. 1
      tests/new/LITERAL2.ttl
  29. 1
      tests/new/LITERAL_LONG1.ttl
  30. 1
      tests/new/LITERAL_LONG1_with_1_squote.nt
  31. 1
      tests/new/LITERAL_LONG1_with_1_squote.ttl
  32. 1
      tests/new/LITERAL_LONG1_with_2_squotes.nt
  33. 1
      tests/new/LITERAL_LONG1_with_2_squotes.ttl
  34. 1
      tests/new/LITERAL_LONG2.ttl
  35. 1
      tests/new/LITERAL_LONG2_with_1_squote.nt
  36. 1
      tests/new/LITERAL_LONG2_with_1_squote.ttl
  37. 1
      tests/new/LITERAL_LONG2_with_2_squotes.nt
  38. 1
      tests/new/LITERAL_LONG2_with_2_squotes.ttl
  39. 2
      tests/new/SPARQL_style_base.ttl
  40. 2
      tests/new/SPARQL_style_prefix.ttl
  41. 1
      tests/new/anonymous_blank_node_object.nt
  42. 1
      tests/new/anonymous_blank_node_object.ttl
  43. 1
      tests/new/anonymous_blank_node_subject.ttl
  44. 1
      tests/new/bareword_a_predicate.nt
  45. 1
      tests/new/bareword_a_predicate.ttl
  46. 1
      tests/new/bareword_decimal.nt
  47. 1
      tests/new/bareword_decimal.ttl
  48. 1
      tests/new/bareword_double.nt
  49. 1
      tests/new/bareword_double.ttl
  50. 1
      tests/new/bareword_integer.ttl
  51. 2
      tests/new/blankNodePropertyList_as_object.nt
  52. 1
      tests/new/blankNodePropertyList_as_object.ttl
  53. 2
      tests/new/blankNodePropertyList_as_subject.nt
  54. 1
      tests/new/blankNodePropertyList_as_subject.ttl
  55. 3
      tests/new/blankNodePropertyList_containing_collection.nt
  56. 1
      tests/new/blankNodePropertyList_containing_collection.ttl
  57. 3
      tests/new/blankNodePropertyList_with_multiple_triples.nt
  58. 1
      tests/new/blankNodePropertyList_with_multiple_triples.ttl
  59. 3
      tests/new/collection_object.nt
  60. 1
      tests/new/collection_object.ttl
  61. 3
      tests/new/collection_subject.nt
  62. 1
      tests/new/collection_subject.ttl
  63. 2
      tests/new/default_namespace_IRI.ttl
  64. 1
      tests/new/double_lower_case_e.nt
  65. 1
      tests/new/double_lower_case_e.ttl
  66. 1
      tests/new/empty_collection.nt
  67. 1
      tests/new/empty_collection.ttl
  68. 7
      tests/new/first.nt
  69. 1
      tests/new/first.ttl
  70. 1
      tests/new/labeled_blank_node_object.nt
  71. 1
      tests/new/labeled_blank_node_object.ttl
  72. 1
      tests/new/labeled_blank_node_subject.nt
  73. 1
      tests/new/labeled_blank_node_subject.ttl
  74. 1
      tests/new/langtagged_LONG.ttl
  75. 1
      tests/new/langtagged_non_LONG.nt
  76. 1
      tests/new/langtagged_non_LONG.ttl
  77. 1
      tests/new/lantag_with_subtag.nt
  78. 1
      tests/new/lantag_with_subtag.ttl
  79. 7
      tests/new/last.nt
  80. 1
      tests/new/last.ttl
  81. 1
      tests/new/literal_false.nt
  82. 1
      tests/new/literal_false.ttl
  83. 1
      tests/new/literal_true.nt
  84. 1
      tests/new/literal_true.ttl
  85. 1
      tests/new/literal_with_BACKSPACE.nt
  86. 1
      tests/new/literal_with_BACKSPACE.ttl
  87. 1
      tests/new/literal_with_CARRIAGE_RETURN.nt
  88. 1
      tests/new/literal_with_CARRIAGE_RETURN.ttl
  89. 1
      tests/new/literal_with_CHARACTER_TABULATION.nt
  90. 1
      tests/new/literal_with_CHARACTER_TABULATION.ttl
  91. 1
      tests/new/literal_with_FORM_FEED.nt
  92. 1
      tests/new/literal_with_FORM_FEED.ttl
  93. 1
      tests/new/literal_with_LINE_FEED.nt
  94. 2
      tests/new/literal_with_LINE_FEED.ttl
  95. 1
      tests/new/literal_with_REVERSE_SOLIDUS.nt
  96. 1
      tests/new/literal_with_REVERSE_SOLIDUS.ttl
  97. 1
      tests/new/literal_with_escaped_BACKSPACE.ttl
  98. 1
      tests/new/literal_with_escaped_CARRIAGE_RETURN.ttl
  99. 1
      tests/new/literal_with_escaped_CHARACTER_TABULATION.ttl
  100. 1
      tests/new/literal_with_escaped_FORM_FEED.ttl
  101. Some files were not shown because too many files have changed in this diff Show More

3
NEWS

@ -1,7 +1,8 @@ @@ -1,7 +1,8 @@
serd (0.18.3) unstable;
* Support most of the latest Turtle Editor's Draft
* Fix possible crash in serd_writer_end_anon() when writing invalid lists
* Generate blank names like _:b1 _:B2 _:el3, not _:genid1 _:docid2 _:genid3
* Generate blank names like _:b1 and _:B2 not _:genid1 _:docid2
* Correctly handle posix_memalign failure
-- David Robillard <d@drobilla.net> Sun, 24 Feb 2013 02:05:30 -0500

12
serd.ttl

@ -1,5 +1,11 @@ @@ -1,5 +1,11 @@
@prefix doap: <http://usefulinc.com/ns/doap#> .
<http://drobilla.net/drobilla#me>
a foaf:Person ;
foaf:name "David Robillard" ;
foaf:mbox <mailto:d@drobilla.net> ;
rdfs:seeAlso <http://drobilla.net/drobilla> .
<http://drobilla.net/sw/serd>
a doap:Project ;
doap:name "Serd" ;
@ -13,4 +19,8 @@ @@ -13,4 +19,8 @@
doap:bug-database <http://dev.drobilla.net/> ;
doap:blog <http://drobilla.net/> ;
doap:developer <http://drobilla.net/drobilla#me> ;
doap:maintainer <http://drobilla.net/drobilla#me> .
doap:maintainer <http://drobilla.net/drobilla#me> ;
doap:repository [
a doap:SVNRepository ;
doap:location <http://svn.drobilla.net/serd/>
] .

631
src/reader.c

@ -267,12 +267,11 @@ read_collection(SerdReader* reader, ReadContext ctx, Ref* dest); @@ -267,12 +267,11 @@ read_collection(SerdReader* reader, ReadContext ctx, Ref* dest);
static bool
read_predicateObjectList(SerdReader* reader, ReadContext ctx);
// [40] hex ::= [#x30-#x39] | [#x41-#x46]
static inline uint8_t
read_hex(SerdReader* reader)
read_HEX(SerdReader* reader)
{
const uint8_t c = peek_byte(reader);
if (in_range(c, 0x30, 0x39) || in_range(c, 0x41, 0x46)) {
if (is_digit(c) || in_range(c, 'A', 'F') || in_range(c, 'a', 'f')) {
return eat_byte_safe(reader, c);
} else {
return r_err(reader, SERD_ERR_BAD_SYNTAX,
@ -280,12 +279,27 @@ read_hex(SerdReader* reader) @@ -280,12 +279,27 @@ read_hex(SerdReader* reader)
}
}
// Read UCHAR escape, initial \ is already eaten by caller
static inline bool
read_hex_escape(SerdReader* reader, unsigned length, Ref dest)
read_UCHAR(SerdReader* reader, Ref dest)
{
const uint8_t b = peek_byte(reader);
unsigned length = 0;
switch (b) {
case 'U':
length = 8;
break;
case 'u':
length = 4;
break;
default:
return false;
}
eat_byte_safe(reader, b);
uint8_t buf[9] = { 0, 0, 0, 0, 0, 0, 0, 0, 0 };
for (unsigned i = 0; i < length; ++i) {
if (!(buf[i] = read_hex(reader))) {
if (!(buf[i] = read_HEX(reader))) {
return false;
}
}
@ -334,32 +348,20 @@ read_hex_escape(SerdReader* reader, unsigned length, Ref dest) @@ -334,32 +348,20 @@ read_hex_escape(SerdReader* reader, unsigned length, Ref dest)
return true;
}
// Read ECHAR escape, initial \ is already eaten by caller
static inline bool
read_character_escape(SerdReader* reader, Ref dest)
{
switch (peek_byte(reader)) {
case '\\':
push_byte(reader, dest, eat_byte_safe(reader, '\\'));
return true;
case 'u':
eat_byte_safe(reader, 'u');
return read_hex_escape(reader, 4, dest);
case 'U':
eat_byte_safe(reader, 'U');
return read_hex_escape(reader, 8, dest);
default:
return false;
}
}
static inline bool
read_echaracter_escape(SerdReader* reader, Ref dest, SerdNodeFlags* flags)
read_ECHAR(SerdReader* reader, Ref dest, SerdNodeFlags* flags)
{
switch (peek_byte(reader)) {
const uint8_t c = peek_byte(reader);
switch (c) {
case 't':
eat_byte_safe(reader, 't');
push_byte(reader, dest, '\t');
return true;
case 'b':
eat_byte_safe(reader, 'b');
push_byte(reader, dest, '\b');
return true;
case 'n':
*flags |= SERD_HAS_NEWLINE;
eat_byte_safe(reader, 'n');
@ -370,34 +372,15 @@ read_echaracter_escape(SerdReader* reader, Ref dest, SerdNodeFlags* flags) @@ -370,34 +372,15 @@ read_echaracter_escape(SerdReader* reader, Ref dest, SerdNodeFlags* flags)
eat_byte_safe(reader, 'r');
push_byte(reader, dest, '\r');
return true;
default:
return read_character_escape(reader, dest);
}
}
static inline bool
read_scharacter_escape(SerdReader* reader, Ref dest, SerdNodeFlags* flags)
{
switch (peek_byte(reader)) {
case '"':
*flags |= SERD_HAS_QUOTE;
push_byte(reader, dest, eat_byte_safe(reader, '"'));
case 'f':
eat_byte_safe(reader, 'f');
push_byte(reader, dest, '\f');
return true;
default:
return read_echaracter_escape(reader, dest, flags);
}
}
static inline bool
read_ucharacter_escape(SerdReader* reader, Ref dest)
{
SerdNodeFlags flags = 0;
switch (peek_byte(reader)) {
case '>':
push_byte(reader, dest, eat_byte_safe(reader, '>'));
case '\\': case '"': case '\'':
push_byte(reader, dest, eat_byte_safe(reader, c));
return true;
default:
return read_echaracter_escape(reader, dest, &flags);
return false;
}
}
@ -427,12 +410,11 @@ read_utf8_character(SerdReader* reader, Ref dest, uint8_t c) @@ -427,12 +410,11 @@ read_utf8_character(SerdReader* reader, Ref dest, uint8_t c)
} else if ((c & 0xF8) == 0xF0) { // Starts with `11110'
size = 4;
} else {
return bad_char(reader, dest, "invalid UTF-8 start 0x%X\n",
eat_byte_safe(reader, c));
return bad_char(reader, dest, "invalid UTF-8 start 0x%X\n", c);
}
char bytes[4];
bytes[0] = eat_byte_safe(reader, c);
bytes[0] = c;
// Check character validity
for (unsigned i = 1; i < size; ++i) {
@ -450,114 +432,19 @@ read_utf8_character(SerdReader* reader, Ref dest, uint8_t c) @@ -450,114 +432,19 @@ read_utf8_character(SerdReader* reader, Ref dest, uint8_t c)
return SERD_SUCCESS;
}
// [38] character ::= '\u' hex hex hex hex
// | '\U' hex hex hex hex hex hex hex hex
// | '\\'
// | [#x20-#x5B] | [#x5D-#x10FFFF]
// Read one character (possibly multi-byte)
// The first byte, c, has already been eaten by caller
static inline SerdStatus
read_character(SerdReader* reader, Ref dest)
read_character(SerdReader* reader, Ref dest, uint8_t c)
{
const uint8_t c = peek_byte(reader);
assert(c != '\\'); // Only called from methods that handle escapes first
if (c == '\0') {
r_err(reader, SERD_ERR_BAD_SYNTAX, "unexpected end of input\n", c);
return SERD_ERR_BAD_SYNTAX;
} else if (c < 0x20) {
return bad_char(reader, dest,
"unexpected control character 0x%X\n",
eat_byte_safe(reader, c));
} else if (!(c & 0x80)) {
push_byte(reader, dest, eat_byte_safe(reader, c));
if (!(c & 0x80)) {
push_byte(reader, dest, c);
return SERD_SUCCESS;
} else {
return read_utf8_character(reader, dest, c);
}
}
// [43] lcharacter ::= echaracter | '\"' | #x9 | #xA | #xD
static inline SerdStatus
read_lcharacter(SerdReader* reader, Ref dest, SerdNodeFlags* flags)
{
const uint8_t c = peek_byte(reader);
uint8_t buf[2];
switch (c) {
case '"':
eat_byte_safe(reader, '\"');
buf[0] = eat_byte_safe(reader, peek_byte(reader));
buf[1] = eat_byte_safe(reader, peek_byte(reader));
if (buf[0] == '\"' && buf[1] == '\"') {
return SERD_FAILURE;
} else {
*flags |= SERD_HAS_QUOTE;
push_byte(reader, dest, c);
push_byte(reader, dest, buf[0]);
push_byte(reader, dest, buf[1]);
return SERD_SUCCESS;
}
case '\\':
eat_byte_safe(reader, '\\');
if (read_scharacter_escape(reader, dest, flags)) {
return SERD_SUCCESS;
} else {
r_err(reader, SERD_ERR_BAD_SYNTAX,
"invalid escape `\\%c'\n", peek_byte(reader));
return SERD_ERR_BAD_SYNTAX;
}
case 0xA: case 0xD:
*flags |= SERD_HAS_NEWLINE;
case 0x9:
push_byte(reader, dest, eat_byte_safe(reader, c));
return SERD_SUCCESS;
default:
return read_character(reader, dest);
}
}
// [42] scharacter ::= ( echaracter - #x22 ) | '\"'
static inline SerdStatus
read_scharacter(SerdReader* reader, Ref dest, SerdNodeFlags* flags)
{
uint8_t c = peek_byte(reader);
switch (c) {
case '\\':
eat_byte_safe(reader, '\\');
if (read_scharacter_escape(reader, dest, flags)) {
return SERD_SUCCESS;
} else {
r_err(reader, SERD_ERR_BAD_SYNTAX,
"invalid escape `\\%c'\n", peek_byte(reader));
return SERD_ERR_BAD_SYNTAX;
}
case '\"':
return SERD_FAILURE;
default:
return read_character(reader, dest);
}
}
// Spec: [41] ucharacter ::= ( character - #x3E ) | '\>'
// Impl: [41] ucharacter ::= ( echaracter - #x3E ) | '\>'
static inline SerdStatus
read_ucharacter(SerdReader* reader, Ref dest)
{
const uint8_t c = peek_byte(reader);
switch (c) {
case '\\':
eat_byte_safe(reader, '\\');
if (read_ucharacter_escape(reader, dest)) {
return SERD_SUCCESS;
} else {
r_err(reader, SERD_ERR_BAD_SYNTAX,
"invalid escape `\\%c'\n", peek_byte(reader));
return SERD_FAILURE;
}
case '>':
return SERD_FAILURE;
default:
return read_character(reader, dest);
}
}
// [10] comment ::= '#' ( [^#xA #xD] )*
static void
read_comment(SerdReader* reader)
@ -617,131 +504,211 @@ eat_delim(SerdReader* reader, const char delim) @@ -617,131 +504,211 @@ eat_delim(SerdReader* reader, const char delim)
return false;
}
// [37] longString ::= #x22 #x22 #x22 lcharacter* #x22 #x22 #x22
// STRING_LITERAL_LONG_QUOTE and STRING_LITERAL_LONG_SINGLE_QUOTE
// Initial triple quotes are already eaten by caller
static Ref
read_longString(SerdReader* reader, SerdNodeFlags* flags)
read_STRING_LITERAL_LONG(SerdReader* reader, SerdNodeFlags* flags, uint8_t q)
{
Ref ref = push_node(reader, SERD_LITERAL, "", 0);
SerdStatus st;
while (!(st = read_lcharacter(reader, ref, flags))) {}
if (st < SERD_ERR_UNKNOWN) {
return ref;
Ref ref = push_node(reader, SERD_LITERAL, "", 0);
while (true) {
const uint8_t c = peek_byte(reader);
switch (c) {
case '\\':
eat_byte_safe(reader, c);
if (!read_ECHAR(reader, ref, flags) && !read_UCHAR(reader, ref)) {
r_err(reader, SERD_ERR_BAD_SYNTAX,
"invalid escape `\\%c'\n", peek_byte(reader));
return pop_node(reader, ref);
}
break;
default:
if (c == q) {
eat_byte_safe(reader, q);
const uint8_t q2 = eat_byte_safe(reader, peek_byte(reader));
const uint8_t q3 = peek_byte(reader);
if (q2 == q && q3 == q) { // End of string
eat_byte_safe(reader, q3);
return ref;
} else {
*flags |= SERD_HAS_QUOTE;
push_byte(reader, ref, c);
read_character(reader, ref, q2);
}
} else {
read_character(reader, ref, eat_byte_safe(reader, c));
}
}
}
return pop_node(reader, ref);
return ref;
}
// [36] string ::= #x22 scharacter* #x22
// STRING_LITERAL_QUOTE and STRING_LITERAL_SINGLE_QUOTE
// Initial quote is already eaten by caller
static Ref
read_string(SerdReader* reader, SerdNodeFlags* flags)
read_STRING_LITERAL(SerdReader* reader, SerdNodeFlags* flags, uint8_t q)
{
Ref ref = push_node(reader, SERD_LITERAL, "", 0);
SerdStatus st;
while (!(st = read_scharacter(reader, ref, flags))) {}
if (st < SERD_ERR_UNKNOWN) {
eat_byte_check(reader, '\"');
return ref;
Ref ref = push_node(reader, SERD_LITERAL, "", 0);
while (true) {
const uint8_t c = peek_byte(reader);
switch (c) {
case '\n': case '\r':
r_err(reader, SERD_ERR_BAD_SYNTAX, "line end in short string\n");
return pop_node(reader, ref);
case '\\':
eat_byte_safe(reader, c);
if (!read_ECHAR(reader, ref, flags) && !read_UCHAR(reader, ref)) {
r_err(reader, SERD_ERR_BAD_SYNTAX,
"invalid escape `\\%c'\n", peek_byte(reader));
return pop_node(reader, ref);
}
break;
default:
if (c == q) {
eat_byte_check(reader, q);
return ref;
} else {
read_character(reader, ref, eat_byte_safe(reader, c));
}
}
}
return pop_node(reader, ref);
eat_byte_check(reader, q);
return ref;
}
// [35] quotedString ::= string | longString
static Ref
read_quotedString(SerdReader* reader, SerdNodeFlags* flags)
read_String(SerdReader* reader, SerdNodeFlags* flags)
{
eat_byte_safe(reader, '\"'); // q1
const uint8_t q1 = peek_byte(reader);
if (q1 != '\"' && q1 != '\'') {
return 0;
}
eat_byte_safe(reader, q1);
const uint8_t q2 = peek_byte(reader);
if (q2 != '\"') { // Non-empty single-quoted string
return read_string(reader, flags);
if (q2 != q1) { // Short string (not triple quoted)
return read_STRING_LITERAL(reader, flags, q1);
}
eat_byte_safe(reader, q2);
const uint8_t q3 = peek_byte(reader);
if (q3 != '\"') { // Empty single-quoted string
if (q3 != q1) { // Empty short string ("" or '')
return push_node(reader, SERD_LITERAL, "", 0);
}
eat_byte_safe(reader, '\"');
return read_longString(reader, flags);
eat_byte_safe(reader, q3);
return read_STRING_LITERAL_LONG(reader, flags, q1);
}
// [34] relativeURI ::= ucharacter*
static inline Ref
read_relativeURI(SerdReader* reader)
static bool
read_PN_CHARS_BASE(SerdReader* reader, Ref dest)
{
Ref ref = push_node(reader, SERD_URI, "", 0);
SerdStatus st;
while (!(st = read_ucharacter(reader, ref))) {}
if (st < SERD_ERR_UNKNOWN) {
return ref;
const uint8_t c = peek_byte(reader);
if (is_alpha(c)) { // TODO: UTF-8
push_byte(reader, dest, eat_byte_safe(reader, c));
return true;
}
return pop_node(reader, ref);
return false;
}
// [30] nameStartChar ::= [A-Z] | "_" | [a-z]
// | [#x00C0-#x00D6] | [#x00D8-#x00F6] | [#x00F8-#x02FF] | [#x0370-#x037D]
// | [#x037F-#x1FFF] | [#x200C-#x200D] | [#x2070-#x218F] | [#x2C00-#x2FEF]
// | [#x3001-#xD7FF] | [#xF900-#xFDCF] | [#xFDF0-#xFFFD] | [#x10000-#xEFFFF]
static inline uchar
read_nameStartChar(SerdReader* reader)
static bool
read_PN_CHARS(SerdReader* reader, Ref dest)
{
const uint8_t c = peek_byte(reader);
if (c == '_' || is_alpha(c) || is_digit(c)) { // TODO: Not correct
return eat_byte_safe(reader, c);
if (is_alpha(c) || is_digit(c) || c == '_' || c == '-') { // TODO: UTF-8
push_byte(reader, dest, eat_byte_safe(reader, c));
return true;
}
return 0;
return false;
}
// [31] nameChar ::= nameStartChar | '-' | [0-9]
// | #x00B7 | [#x0300-#x036F] | [#x203F-#x2040]
static inline uchar
read_nameChar(SerdReader* reader)
static bool
read_PERCENT(SerdReader* reader, Ref dest)
{
push_byte(reader, dest, eat_byte_safe(reader, '%'));
const uint8_t h1 = read_HEX(reader);
const uint8_t h2 = read_HEX(reader);
if (h1 && h2) {
push_byte(reader, dest, h1);
push_byte(reader, dest, h2);
return true;
}
return false;
}
static bool
read_PLX(SerdReader* reader, Ref dest)
{
uchar c = read_nameStartChar(reader);
if (c)
return c;
uint8_t c = peek_byte(reader);
switch (c) {
case '%':
return read_PERCENT(reader, dest);
case '\\':
eat_byte_safe(reader, c);
c = peek_byte(reader);
push_byte(reader, dest, eat_byte_safe(reader, c));
return true;
}
return false;
}
switch ((c = peek_byte(reader))) {
case '-': case 0xB7: case '0': case '1': case '2': case '3': case '4':
case '5': case '6': case '7': case '8': case '9':
return eat_byte_safe(reader, c);
default: // TODO: 0x300-0x036F | 0x203F-0x2040
return 0;
static bool
read_PN_LOCAL(SerdReader* reader, Ref dest)
{
uint8_t c = peek_byte(reader);
if (is_digit(c) || c == ':' || c == '_') {
push_byte(reader, dest, eat_byte_safe(reader, c));
} else if (!read_PLX(reader, dest) && !read_PN_CHARS(reader, dest)) {
return false;
}
return 0;
while ((c = peek_byte(reader))) { // Middle: (PN_CHARS | '.')*
if (/*c == '.' || */c == ':') {
push_byte(reader, dest, eat_byte_safe(reader, c));
} else if (!read_PLX(reader, dest) && !read_PN_CHARS(reader, dest)) {
break;
}
}
return dest;
}
// [33] prefixName ::= ( nameStartChar - '_' ) nameChar*
static Ref
read_prefixName(SerdReader* reader, Ref dest)
read_PN_PREFIX(SerdReader* reader, Ref dest)
{
uint8_t c = peek_byte(reader);
if (c == '_') {
r_err(reader, SERD_ERR_BAD_SYNTAX, "unexpected `_'\n");
return pop_node(reader, dest);
Ref prefix = dest ? dest : push_node(reader, SERD_CURIE, "", 0);
if (!read_PN_CHARS_BASE(reader, prefix)) { // First: PN_CHARS_BASE
if (prefix != dest) {
return pop_node(reader, prefix);
}
return dest;
}
TRY_RET(c = read_nameStartChar(reader));
if (!dest) {
dest = push_node(reader, SERD_CURIE, "", 0);
uint8_t c;
while ((c = peek_byte(reader))) { // Middle: (PN_CHARS | '.')*
if (c == '.') {
push_byte(reader, prefix, eat_byte_safe(reader, c));
} else if (!read_PN_CHARS(reader, prefix)) {
break;
}
}
push_byte(reader, dest, c);
while ((c = read_nameChar(reader))) {
push_byte(reader, dest, c);
if (c == '.' && !read_PN_CHARS(reader, prefix)) { // Last: PN_CHARS
return r_err(reader, SERD_ERR_BAD_SYNTAX,
"invalid prefix character\n");
}
return dest;
return prefix;
}
// [32] name ::= nameStartChar nameChar*
static Ref
read_name(SerdReader* reader, Ref dest)
read_PNAME_NS(SerdReader* reader, Ref dest)
{
uchar c = read_nameStartChar(reader);
if (!c) {
return 0;
const Ref prefix = read_PN_PREFIX(reader, dest);
if (prefix && eat_byte_check(reader, ':') != ':') {
return r_err(reader, SERD_ERR_BAD_SYNTAX, "expected `:'\n");
}
do {
push_byte(reader, dest, c);
} while ((c = read_nameChar(reader)) != 0);
return dest;
return prefix;
}
// [29] language ::= [a-z]+ ('-' [a-z0-9]+ )*
@ -767,35 +734,57 @@ read_language(SerdReader* reader) @@ -767,35 +734,57 @@ read_language(SerdReader* reader)
return ref;
}
// [28] uriref ::= '<' relativeURI '>'
static Ref
read_uriref(SerdReader* reader)
read_IRIREF(SerdReader* reader)
{
TRY_RET(eat_byte_check(reader, '<'));
Ref const str = read_relativeURI(reader);
if (str && eat_byte_check(reader, '>')) {
return str;
Ref ref = push_node(reader, SERD_URI, "", 0);
while (true) {
const uint8_t c = peek_byte(reader);
switch (c) {
case '"': case '<': case '^': case '`': case '{': case '|': case '}':
r_err(reader, SERD_ERR_BAD_SYNTAX,
"invalid IRI character `%c'\n", c);
return pop_node(reader, ref);
case '>':
eat_byte_safe(reader, c);
return ref;
case '\\':
eat_byte_safe(reader, c);
if (!read_UCHAR(reader, ref)) {
r_err(reader, SERD_ERR_BAD_SYNTAX,
"invalid IRI character `%c'\n", c);
return pop_node(reader, ref);
}
break;
default:
if (c <= 0x20) {
return pop_node(reader, ref);
} else {
push_byte(reader, ref, eat_byte_safe(reader, c));
}
}
}
return pop_node(reader, str);
}
// [27] qname ::= prefixName? ':' name?
static Ref
read_qname(SerdReader* reader, Ref dest, bool read_prefix)
read_PrefixedName(SerdReader* reader, Ref dest, bool read_prefix)
{
Ref str = 0;
if (!dest) {
dest = push_node(reader, SERD_CURIE, "", 0);
}
if (read_prefix) {
read_prefixName(reader, dest);
if (!read_PNAME_NS(reader, dest)) {
return pop_node(reader, dest);
}
push_byte(reader, dest, ':');
}
TRY_THROW(eat_byte_check(reader, ':'));
push_byte(reader, dest, ':');
str = read_name(reader, dest);
return str ? str : dest;
except:
return pop_node(reader, dest);
if (!read_PN_LOCAL(reader, dest)) {
if (!read_prefix) {
return pop_node(reader, dest);
}
}
return dest;
}
static bool
@ -841,7 +830,7 @@ read_number(SerdReader* reader, Ref* dest, Ref* datatype) @@ -841,7 +830,7 @@ read_number(SerdReader* reader, Ref* dest, Ref* datatype)
TRY_THROW(read_0_9(reader, ref, true));
} else {
// all other cases ::= ( '-' | '+' ) [0-9]+ ( . )? ( [0-9]+ )? ...
assert(is_digit(c));
TRY_THROW(is_digit(c));
read_0_9(reader, ref, true);
if ((c = peek_byte(reader)) == '.') {
has_decimal = true;
@ -858,7 +847,7 @@ read_number(SerdReader* reader, Ref* dest, Ref* datatype) @@ -858,7 +847,7 @@ read_number(SerdReader* reader, Ref* dest, Ref* datatype)
push_byte(reader, ref, eat_byte_safe(reader, c));
default: break;
}
read_0_9(reader, ref, true);
TRY_THROW(read_0_9(reader, ref, true));
*datatype = push_node(reader, SERD_URI,
XSD_DOUBLE, sizeof(XSD_DOUBLE) - 1);
} else if (has_decimal) {
@ -876,16 +865,15 @@ except: @@ -876,16 +865,15 @@ except:
return false;
}
// [25] resource ::= uriref | qname
static bool
read_resource(SerdReader* reader, Ref* dest)
read_iri(SerdReader* reader, Ref* dest)
{
switch (peek_byte(reader)) {
case '<':
*dest = read_uriref(reader);
*dest = read_IRIREF(reader);
break;
default:
*dest = read_qname(reader, 0, true);
*dest = read_PrefixedName(reader, 0, true);
}
return *dest != 0;
}
@ -894,7 +882,7 @@ static bool @@ -894,7 +882,7 @@ static bool
read_literal(SerdReader* reader, Ref* dest,
Ref* datatype, Ref* lang, SerdNodeFlags* flags)
{
Ref str = read_quotedString(reader, flags);
Ref str = read_String(reader, flags);
if (!str) {
return false;
}
@ -903,7 +891,7 @@ read_literal(SerdReader* reader, Ref* dest, @@ -903,7 +891,7 @@ read_literal(SerdReader* reader, Ref* dest,
case '^':
eat_byte_safe(reader, '^');
eat_byte_check(reader, '^');
TRY_THROW(read_resource(reader, datatype));
TRY_THROW(read_iri(reader, datatype));
break;
case '@':
eat_byte_safe(reader, '@');
@ -912,6 +900,8 @@ read_literal(SerdReader* reader, Ref* dest, @@ -912,6 +900,8 @@ read_literal(SerdReader* reader, Ref* dest,
*dest = str;
return true;
except:
pop_node(reader, *datatype);
pop_node(reader, *lang);
pop_node(reader, str);
return false;
}
@ -936,20 +926,24 @@ read_verb(SerdReader* reader, Ref* dest) @@ -936,20 +926,24 @@ read_verb(SerdReader* reader, Ref* dest)
bool ret;
switch (peek_byte(reader)) {
case '<':
ret = (*dest = read_uriref(reader));
ret = (*dest = read_IRIREF(reader));
break;
default:
/* Either a qname, or "a". Read the prefix first, and if it is in fact
"a", produce that instead.
*/
*dest = read_prefixName(reader, 0);
*dest = read_PN_PREFIX(reader, 0);
node = deref(reader, *dest);
if (node && node->n_bytes == 1 && node->buf[0] == 'a'
&& is_token_end(peek_byte(reader))) {
pop_node(reader, *dest);
ret = (*dest = push_node(reader, SERD_URI, NS_RDF "type", 47));
} else {
ret = (*dest = read_qname(reader, *dest, false));
ret = (*dest = read_PrefixedName(reader, *dest, false));
}
if (*dest && !strncmp((char*)deref(reader, *dest)->buf, "_:", 2)) {
*dest = pop_node(reader, *dest);
return false;
}
}
read_ws_star(reader);
@ -958,51 +952,71 @@ read_verb(SerdReader* reader, Ref* dest) @@ -958,51 +952,71 @@ read_verb(SerdReader* reader, Ref* dest)
// [26] nodeID ::= '_:' name
static Ref
read_nodeID(SerdReader* reader)
read_BLANK_NODE_LABEL(SerdReader* reader)
{
eat_byte_safe(reader, '_');
eat_byte_check(reader, ':');
Ref ref = push_node(reader, SERD_BLANK,
reader->bprefix ? (char*)reader->bprefix : "",
reader->bprefix_len);
if (!read_name(reader, ref)) {
return r_err(reader, SERD_ERR_BAD_SYNTAX,
"invalid character at start of name\n");
uint8_t c = peek_byte(reader); // First: (PN_CHARS | '_' | [0-9])
if (is_digit(c) || c == '_') {
push_byte(reader, ref, c);
} else if (!read_PN_CHARS(reader, ref)) {
r_err(reader, SERD_ERR_BAD_SYNTAX, "invalid name start character\n");
return pop_node(reader, ref);
}
while ((c = peek_byte(reader))) { // Middle: (PN_CHARS | '.')*
if (c == '.') {
push_byte(reader, ref, eat_byte_safe(reader, c));
} else if (!read_PN_CHARS(reader, ref)) {
break;
}
}
if (c == '.' && !read_PN_CHARS(reader, ref)) { // Last: PN_CHARS
r_err(reader, SERD_ERR_BAD_SYNTAX, "invalid name character\n");
return pop_node(reader, ref);
}
if (reader->syntax == SERD_TURTLE) {
const char* const buf = (const char*)deref(reader, ref)->buf;
if (buf[0] == 'b' && is_digit(buf[1])) {
((char*)buf)[0] = 'B'; // Prevent clash
reader->seen_genid = true;
} else if (reader->seen_genid && buf[0] == 'B') {
r_err(reader, SERD_ERR_ID_CLASH,
"found both `b' and `B' blank IDs, prefix required\n");
return pop_node(reader, ref);
if (is_digit(buf[1])) {
if (buf[0] == 'b') {
((char*)buf)[0] = 'B'; // Prevent clash
reader->seen_genid = true;
} else if (reader->seen_genid && buf[0] == 'B') {
r_err(reader, SERD_ERR_ID_CLASH,
"found both `b' and `B' blank IDs, prefix required\n");
return pop_node(reader, ref);
}
}
}
return ref;
}
static void
set_blank_id(SerdReader* reader, Ref ref, const char* b, size_t buf_size)
set_blank_id(SerdReader* reader, Ref ref, size_t buf_size)
{
SerdNode* node = deref(reader, ref);
const char* prefix = reader->bprefix ? (const char*)reader->bprefix : "";
node->n_bytes = node->n_chars = snprintf(
(char*)node->buf, buf_size, "%s%s%u", prefix, b, reader->next_id++);
(char*)node->buf, buf_size, "%sb%u", prefix, reader->next_id++);
}
static size_t
genid_size(SerdReader* reader)
{
return reader->bprefix_len + 2 + 10 + 1; // + "el" + UINT32_MAX + \0
return reader->bprefix_len + 1 + 10 + 1; // + "b" + UINT32_MAX + \0
}
static Ref
blank_id(SerdReader* reader, const char* b)
blank_id(SerdReader* reader)
{
Ref ref = push_node_padded(reader, genid_size(reader), SERD_BLANK, "", 0);
set_blank_id(reader, ref, b, genid_size(reader));
set_blank_id(reader, ref, genid_size(reader));
return ref;
}
@ -1017,7 +1031,7 @@ read_blank(SerdReader* reader, ReadContext ctx, bool subject, Ref* dest) @@ -1017,7 +1031,7 @@ read_blank(SerdReader* reader, ReadContext ctx, bool subject, Ref* dest)
bool empty;
switch (peek_byte(reader)) {
case '_':
return (*dest = read_nodeID(reader));
return (*dest = read_BLANK_NODE_LABEL(reader));
case '[':
eat_byte_safe(reader, '[');
if ((empty = peek_delim(reader, ']'))) {
@ -1026,7 +1040,7 @@ read_blank(SerdReader* reader, ReadContext ctx, bool subject, Ref* dest) @@ -1026,7 +1040,7 @@ read_blank(SerdReader* reader, ReadContext ctx, bool subject, Ref* dest)
*ctx.flags |= (subject) ? SERD_ANON_S_BEGIN : SERD_ANON_O_BEGIN;
}
*dest = blank_id(reader, "b");
*dest = blank_id(reader);
if (ctx.subject) {
TRY_RET(emit_statement(reader, ctx, *dest, 0, 0));
}
@ -1085,20 +1099,21 @@ read_object(SerdReader* reader, ReadContext ctx) @@ -1085,20 +1099,21 @@ read_object(SerdReader* reader, ReadContext ctx)
TRY_THROW(ret = read_blank(reader, ctx, false, &o));
break;
case '<': case ':':
TRY_THROW(ret = read_resource(reader, &o));
TRY_THROW(ret = read_iri(reader, &o));
break;
case '+': case '-': case '.': case '0': case '1': case '2': case '3':
case '4': case '5': case '6': case '7': case '8': case '9':
TRY_THROW(ret = read_number(reader, &o, &datatype));
break;
case '\"':
case '\'':
TRY_THROW(ret = read_literal(reader, &o, &datatype, &lang, &flags));
break;
default:
/* Either a boolean literal, or a qname. Read the prefix first, and if
it is in fact a "true" or "false" literal, produce that instead.
*/
o = read_prefixName(reader, 0);
o = read_PN_PREFIX(reader, 0);
node = deref(reader, o);
if (node && is_token_end(peek_byte(reader)) &&
((node->n_bytes == 4 && !memcmp(node->buf, "true", 4))
@ -1108,7 +1123,10 @@ read_object(SerdReader* reader, ReadContext ctx) @@ -1108,7 +1123,10 @@ read_object(SerdReader* reader, ReadContext ctx)
XSD_BOOLEAN, XSD_BOOLEAN_LEN);
} else {
o = o ? o : push_node(reader, SERD_CURIE, "", 0);
o = read_qname(reader, o, false);
o = read_PrefixedName(reader, o, false);
if (!o) {
pop_node(reader, o);
}
}
ret = o;
}
@ -1152,6 +1170,8 @@ read_predicateObjectList(SerdReader* reader, ReadContext ctx) @@ -1152,6 +1170,8 @@ read_predicateObjectList(SerdReader* reader, ReadContext ctx)
ctx.predicate = pop_node(reader, ctx.predicate);
while (eat_delim(reader, ';')) {
switch (peek_byte(reader)) {
case ';':
continue;
case '.': case ']':
return true;
default:
@ -1183,7 +1203,7 @@ read_collection(SerdReader* reader, ReadContext ctx, Ref* dest) @@ -1183,7 +1203,7 @@ read_collection(SerdReader* reader, ReadContext ctx, Ref* dest)
{
eat_byte_safe(reader, '(');
bool end = peek_delim(reader, ')');
*dest = end ? reader->rdf_nil : blank_id(reader, "el");
*dest = end ? reader->rdf_nil : blank_id(reader);
if (ctx.subject) {
// subject predicate _:head
*ctx.flags |= (end ? 0 : SERD_LIST_O_BEGIN);
@ -1216,9 +1236,9 @@ read_collection(SerdReader* reader, ReadContext ctx, Ref* dest) @@ -1216,9 +1236,9 @@ read_collection(SerdReader* reader, ReadContext ctx, Ref* dest)
/* Give rest a new ID. Done as late as possible to ensure it is
used and > IDs generated by read_object above. */
if (!rest) {
rest = n2 = blank_id(reader, "el"); // First pass, push
rest = n2 = blank_id(reader); // First pass, push
} else {
set_blank_id(reader, rest, "el", genid_size(reader));
set_blank_id(reader, rest, genid_size(reader));
}
}
@@ -1238,15 +1258,18 @@ read_collection(SerdReader* reader, ReadContext ctx, Ref* dest)
// [11] subject ::= resource | blank
static Ref
read_subject(SerdReader* reader, ReadContext ctx)
read_subject(SerdReader* reader, ReadContext ctx, bool* nested)
{
Ref subject = 0;
switch (peek_byte(reader)) {
case '[': case '(': case '_':
case '[': case '(':
*nested = true;
// nobreak
case '_':
read_blank(reader, ctx, true, &subject);
break;
default:
read_resource(reader, &subject);
read_iri(reader, &subject);
}
return subject;
}
@@ -1256,12 +1279,19 @@ read_subject(SerdReader* reader, ReadContext ctx)
static bool
read_triples(SerdReader* reader, ReadContext ctx)
{
const Ref subject = read_subject(reader, ctx);
bool nested = false;
const Ref subject = read_subject(reader, ctx, &nested);
bool ret = false;
if (subject) {
ctx.subject = subject;
TRY_RET(read_ws_plus(reader));
ret = read_predicateObjectList(reader, ctx);
if (nested) {
read_ws_star(reader);
read_predicateObjectList(reader, ctx);
ret = true;
} else {
TRY_RET(read_ws_plus(reader));
ret = read_predicateObjectList(reader, ctx);
}
pop_node(reader, subject);
}
ctx.subject = ctx.predicate = 0;
@@ -1276,7 +1306,7 @@ read_base(SerdReader* reader)
eat_string(reader, "base", 4);
TRY_RET(read_ws_plus(reader));
Ref uri;
TRY_RET(uri = read_uriref(reader));
TRY_RET(uri = read_IRIREF(reader));
if (reader->base_sink) {
reader->base_sink(reader->handle, deref(reader, uri));
}
@@ -1289,26 +1319,29 @@ read_base(SerdReader* reader)
static bool
read_prefixID(SerdReader* reader)
{
bool ret = true;
Ref name = 0;
Ref uri = 0;
bool ret = true;
// `@' is already eaten in read_directive
eat_string(reader, "prefix", 6);
TRY_RET(read_ws_plus(reader));
name = read_prefixName(reader, 0);
if (!name) {
name = push_node(reader, SERD_LITERAL, "", 0);
Ref name = push_node(reader, SERD_LITERAL, "", 0);
if (!read_PNAME_NS(reader, name)) {
return pop_node(reader, name);
}
TRY_THROW(eat_byte_check(reader, ':') == ':');
read_ws_star(reader);
TRY_THROW(uri = read_uriref(reader));
const Ref uri = read_IRIREF(reader);
if (!uri) {
pop_node(reader, name);
return false;
}
if (reader->prefix_sink) {
ret = !reader->prefix_sink(reader->handle,
deref(reader, name),
deref(reader, uri));
}
pop_node(reader, uri);
except:
pop_node(reader, name);
return ret;
}

210
src/writer.c

@@ -145,17 +145,133 @@ sink(const void* buf, size_t len, SerdWriter* writer)
}
}
// Parse a UTF-8 character, set *size to the length, and return the code point.
//
// The lead byte utf8[0] determines the encoded length (1 to 4 bytes).  If the
// lead byte is invalid, or a following byte is not a continuation byte, an
// error is reported with w_err(), *size is set to 0, and 0 is returned.
static inline uint32_t
parse_utf8_char(SerdWriter* writer, const uint8_t* utf8, size_t* size)
{
	uint32_t c = 0;
	if ((utf8[0] & 0x80) == 0) {  // Starts with `0'
		*size = 1;
		c     = utf8[0];
	} else if ((utf8[0] & 0xE0) == 0xC0) {  // Starts with `110'
		*size = 2;
		c     = utf8[0] & 0x1F;
	} else if ((utf8[0] & 0xF0) == 0xE0) {  // Starts with `1110'
		*size = 3;
		c     = utf8[0] & 0x0F;
	} else if ((utf8[0] & 0xF8) == 0xF0) {  // Starts with `11110'
		*size = 4;
		c     = utf8[0] & 0x07;
	} else {
		w_err(writer, SERD_ERR_BAD_ARG, "invalid UTF-8: %X\n", utf8[0]);
		*size = 0;
		return 0;
	}

	// Fold the low 6 bits of each continuation byte into the code point
	for (size_t i = 1; i < *size; ++i) {
		if ((utf8[i] & 0xC0) != 0x80) {  // Does not start with `10'
			// Stop at the offending byte so a truncated sequence is not
			// decoded (or read) any further
			w_err(writer, SERD_ERR_BAD_ARG, "invalid UTF-8: %X\n", utf8[i]);
			*size = 0;
			return 0;
		}
		c = (c << 6) | (uint32_t)(utf8[i] & 0x3F);
	}

	return c;
}
// Write a single character, as an escape for single byte characters
// (Caller prints any single byte characters that don't need escaping)
//
// Sets *size to the number of input bytes the character occupies, or to 0 if
// the input is not valid UTF-8 (in which case the bytes of U+FFFD are written
// instead).  Multi-byte characters are written verbatim unless the writer
// style has SERD_STYLE_ASCII set, in which case they are written as \uXXXX or
// \UXXXXXXXX escapes.  Returns the value returned by sink().
static size_t
write_character(SerdWriter* writer, const uint8_t* utf8, size_t* size)
{
	static const uint8_t replacement_char[] = { 0xEF, 0xBF, 0xBD };

	char           escape[11] = { 0 };
	const uint32_t c          = parse_utf8_char(writer, utf8, size);
	switch (*size) {
	case 0:
		// parse_utf8_char has already reported the error, don't repeat it
		return sink(replacement_char, sizeof(replacement_char), writer);
	case 1:
		snprintf(escape, sizeof(escape), "\\u%04X", (unsigned)utf8[0]);
		return sink(escape, 6, writer);
	default:
		break;
	}

	if (!(writer->style & SERD_STYLE_ASCII)) {
		// Write UTF-8 character directly to UTF-8 output
		return sink(utf8, *size, writer);
	}

	if (c <= 0xFFFF) {
		// Fits in four hex digits (U+FFFF included)
		snprintf(escape, sizeof(escape), "\\u%04X", (unsigned)c);
		return sink(escape, 6, writer);
	}

	snprintf(escape, sizeof(escape), "\\U%08X", (unsigned)c);
	return sink(escape, 10, writer);
}
// Return true if byte `c` can not be written literally inside a URI: the
// characters listed below, and anything in_range() rejects for 0x20..0x7E.
static inline bool
uri_must_escape(const uint8_t c)
{
	const bool listed =
		(c == ' ' || c == '"' || c == '<' || c == '>' || c == '\\' ||
		 c == '^' || c == '`' || c == '{' || c == '|' || c == '}');

	// The range check only runs for characters that are not listed above
	return listed || !in_range(c, 0x20, 0x7E);
}
// Write `n_bytes` bytes of `utf8` as URI text.  Runs of characters for which
// uri_must_escape() is false are passed to sink() in bulk; every other
// character goes through write_character().  Writing stops early if
// write_character() reports a size of 0.  Returns the sum of the values
// returned by the sink() and write_character() calls.
static size_t
write_uri(SerdWriter* writer, const uint8_t* utf8, size_t n_bytes)
{
	size_t written = 0;
	size_t pos     = 0;
	while (pos < n_bytes) {
		// Find the end of the run that can be written verbatim
		size_t run_end = pos;
		while (run_end < n_bytes && !uri_must_escape(utf8[run_end])) {
			++run_end;
		}

		if (run_end > pos) {
			// Bulk write everything before the next special character
			written += sink(&utf8[pos], run_end - pos, writer);
			pos = run_end;
			if (pos == n_bytes) {
				break;
			}
		}

		// utf8[pos] is a character that must be escaped
		size_t size = 0;
		written += write_character(writer, utf8 + pos, &size);
		if (size == 0) {
			return written;
		}
		pos += size;
	}
	return written;
}
static size_t
write_text(SerdWriter* writer, TextContext ctx,
const uint8_t* utf8, size_t n_bytes)
{
size_t len = 0;
char escape[11] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
size_t len = 0;
for (size_t i = 0; i < n_bytes;) {
// Fast bulk write for long strings of printable ASCII
size_t j = i;
for (; j < n_bytes; ++j) {
if (utf8[j] == '>' || utf8[j] == '\\' || utf8[j] == '"'
if (utf8[j] == '\\' || utf8[j] == '"'
|| (!in_range(utf8[j], 0x20, 0x7E))) {
break;
}
@@ -174,82 +290,27 @@ write_text(SerdWriter* writer, TextContext ctx,
} else if (in == '\"' && i == n_bytes) {
len += sink("\\\"", 2, writer); continue; // '"' at string end
}
} else {
} else if (ctx == WRITE_STRING) {
switch (in) {
case '\\': len += sink("\\\\", 2, writer); continue;
case '\n': len += sink("\\n", 2, writer); continue;
case '\r': len += sink("\\r", 2, writer); continue;
case '\t': len += sink("\\t", 2, writer); continue;
case '"':
if (ctx == WRITE_STRING) {
len += sink("\\\"", 2, writer);
continue;
} // else fall-through
case '\b': len += sink("\\b", 2, writer); continue;
case '\f': len += sink("\\f", 2, writer); continue;
case '"': len += sink("\\\"", 2, writer); continue;
default: break;
}
if ((ctx == WRITE_STRING && in == '"') ||
(ctx == WRITE_URI && in == '>')) {
snprintf(escape, sizeof(escape), "\\u%04X",
ctx == WRITE_STRING ? '"' : '>');
len += sink(escape, 6, writer);
continue;
}
}
uint32_t c = 0;
size_t size = 0;
if ((in & 0x80) == 0) { // Starts with `0'
c = in & 0x7F;
if (in_range(c, 0x20, 0x7E)
|| (is_space(c) && ctx == WRITE_LONG_STRING)) {
len += sink(&in, 1, writer); // Print ASCII character
} else {
snprintf(escape, sizeof(escape), "\\u%04X", c);
len += sink(escape, 6, writer); // ASCII control character
}
continue;
} else if ((in & 0xE0) == 0xC0) { // Starts with `110'
size = 2;
c = in & 0x1F;
} else if ((in & 0xF0) == 0xE0) { // Starts with `1110'
size = 3;
c = in & 0x0F;
} else if ((in & 0xF8) == 0xF0) { // Starts with `11110'
size = 4;
c = in & 0x07;
} else {
w_err(writer, SERD_ERR_BAD_ARG, "invalid UTF-8: %X\n", in);
const uint8_t replacement_char[] = { 0xEF, 0xBF, 0xBD };
len += sink(replacement_char, sizeof(replacement_char), writer);
return len;
}
if (ctx != WRITE_URI && !(writer->style & SERD_STYLE_ASCII)) {
// Write UTF-8 character directly to UTF-8 output
// TODO: Always parse and validate character?
len += sink(utf8 + i - 1, size, writer);
i += size - 1;
continue;
}
#define READ_BYTE() \
in = utf8[i++] & 0x3f; \
c = (c << 6) | in;
size_t size = 0;
len += write_character(writer, utf8 + i - 1, &size);
switch (size) {
case 4: READ_BYTE();
case 3: READ_BYTE();
case 2: READ_BYTE();
if (size == 0) {
return len;
}
if (c < 0xFFFF) {
snprintf(escape, sizeof(escape), "\\u%04X", c);
len += sink(escape, 6, writer);
} else {
snprintf(escape, sizeof(escape), "\\U%08X", c);
len += sink(escape, 10, writer);
}
i += size - 1;
}
return len;
}
@@ -257,8 +318,7 @@ write_text(SerdWriter* writer, TextContext ctx,
static size_t
uri_sink(const void* buf, size_t len, void* stream)
{
return write_text((SerdWriter*)stream, WRITE_URI,
(const uint8_t*)buf, len);
return write_uri((SerdWriter*)stream, (const uint8_t*)buf, len);
}
static void
@@ -369,8 +429,8 @@ write_node(SerdWriter* writer,
return false;
}
sink("<", 1, writer);
write_text(writer, WRITE_URI, uri_prefix.buf, uri_prefix.len);
write_text(writer, WRITE_URI, uri_suffix.buf, uri_suffix.len);
write_uri(writer, uri_prefix.buf, uri_prefix.len);
write_uri(writer, uri_suffix.buf, uri_suffix.len);
sink(">", 1, writer);
break;
case SERD_TURTLE:
@@ -420,9 +480,9 @@ write_node(SerdWriter* writer,
SerdNode prefix;
SerdChunk suffix;
if (serd_env_qualify(writer->env, node, &prefix, &suffix)) {
write_text(writer, WRITE_URI, prefix.buf, prefix.n_bytes);
write_uri(writer, prefix.buf, prefix.n_bytes);
sink(":", 1, writer);
write_text(writer, WRITE_URI, suffix.buf, suffix.len);
write_uri(writer, suffix.buf, suffix.len);
break;
}
}
@@ -442,7 +502,7 @@ write_node(SerdWriter* writer,
&uri, &writer->base_uri, root, uri_sink, writer);