Serd subproject with meson
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 

400 lines
9.7 KiB

/* Serd, an RDF serialisation library.
* Copyright 2011 David Robillard <d@drobilla.net>
*
* Serd is free software: you can redistribute it and/or modify it under
* the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Serd is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
* License for details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#include <assert.h>
#include <stdlib.h>
#include <string.h>
#include "serd/serd.h"
//#define URI_DEBUG 1
/** Return true if @a c lies within [min...max] (inclusive) */
static inline bool
in_range(const char c, const char min, const char max)
{
return (c >= min && c <= max);
}
/** RFC2234: ALPHA := %x41-5A / %x61-7A ; A-Z / a-z */
static inline bool
is_alpha(const uint8_t c)
{
return in_range(c, 'A', 'Z') || in_range(c, 'a', 'z');
}
/** RFC2234: DIGIT ::= %x30-39 ; 0-9 */
static inline bool
is_digit(const uint8_t c)
{
return in_range(c, '0', '9');
}
SERD_API
bool
serd_uri_string_has_scheme(const uint8_t* utf8)
{
// RFC3986: scheme ::= ALPHA *( ALPHA / DIGIT / "+" / "-" / "." )
if (!is_alpha(utf8[0])) {
return false; // Invalid scheme initial character, URI is relative
}
for (uint8_t c = *++utf8; (c = *utf8) != '\0'; ++utf8) {
switch (c) {
case ':':
return true; // End of scheme
case '+': case '-': case '.':
break; // Valid scheme character, continue
default:
if (!is_alpha(c) && !is_digit(c)) {
return false; // Invalid scheme character
}
}
}
return false;
}
#ifdef URI_DEBUG
static void
serd_uri_dump(const SerdURI* uri, FILE* file)
{
#define PRINT_PART(range, name) \
if (range.buf) { \
fprintf(stderr, " " name " = "); \
fwrite((range).buf, 1, (range).len, stderr); \
fprintf(stderr, "\n"); \
}
PRINT_PART(uri->scheme, "scheme");
PRINT_PART(uri->authority, "authority");
PRINT_PART(uri->path_base, "path_base");
PRINT_PART(uri->path, "path");
PRINT_PART(uri->query, "query");
PRINT_PART(uri->fragment, "fragment");
}
#endif
SERD_API
bool
serd_uri_parse(const uint8_t* utf8, SerdURI* uri)
{
*uri = SERD_URI_NULL;
assert(uri->path_base.buf == NULL);
assert(uri->path_base.len == 0);
assert(uri->authority.len == 0);
const uint8_t* ptr = utf8;
/* See http://tools.ietf.org/html/rfc3986#section-3
URI = scheme ":" hier-part [ "?" query ] [ "#" fragment ]
*/
/* S3.1: scheme ::= ALPHA *( ALPHA / DIGIT / "+" / "-" / "." ) */
if (is_alpha(*ptr)) {
for (uint8_t c = *++ptr; true; c = *++ptr) {
switch (c) {
case '\0': case '/': case '?': case '#':
ptr = utf8;
goto path; // Relative URI (starts with path by definition)
case ':':
uri->scheme.buf = utf8;
uri->scheme.len = (ptr++) - utf8;
goto maybe_authority; // URI with scheme
case '+': case '-': case '.':
continue;
default:
if (is_alpha(c) || is_digit(c)) {
continue;
}
}
}
}
/* S3.2: The authority component is preceded by a double slash ("//")
and is terminated by the next slash ("/"), question mark ("?"),
or number sign ("#") character, or by the end of the URI.
*/
maybe_authority:
if (*ptr == '/' && *(ptr + 1) == '/') {
ptr += 2;
uri->authority.buf = ptr;
assert(uri->authority.len == 0);
for (uint8_t c = *ptr; (c = *ptr) != '\0'; ++ptr) {
switch (c) {
case '/': goto path;
case '?': goto query;
case '#': goto fragment;
default:
++uri->authority.len;
}
}
}
/* RFC3986 S3.3: The path is terminated by the first question mark ("?")
or number sign ("#") character, or by the end of the URI.
*/
path:
switch (*ptr) {
case '?': goto query;
case '#': goto fragment;
case '\0': goto end;
default: break;
}
uri->path.buf = ptr;
uri->path.len = 0;
for (uint8_t c = *ptr; (c = *ptr) != '\0'; ++ptr) {
switch (c) {
case '?': goto query;
case '#': goto fragment;
default:
++uri->path.len;
}
}
/* RFC3986 S3.4: The query component is indicated by the first question
mark ("?") character and terminated by a number sign ("#") character
or by the end of the URI.
*/
query:
if (*ptr == '?') {
uri->query.buf = ++ptr;
for (uint8_t c = *ptr; (c = *ptr) != '\0'; ++ptr) {
switch (c) {
case '#':
goto fragment;
default:
++uri->query.len;
}
}
}
/* RFC3986 S3.5: A fragment identifier component is indicated by the
presence of a number sign ("#") character and terminated by the end
of the URI.
*/
fragment:
if (*ptr == '#') {
uri->fragment.buf = ptr;
while (*ptr++ != '\0') {
++uri->fragment.len;
}
}
end:
#ifdef URI_DEBUG
fprintf(stderr, "PARSE URI <%s>\n", utf8);
serd_uri_dump(uri, stderr);
fprintf(stderr, "\n");
#endif
return true;
}
SERD_API
bool
serd_uri_resolve(const SerdURI* r, const SerdURI* base, SerdURI* t)
{
assert(!r->scheme.len); // r is relative
// See http://tools.ietf.org/html/rfc3986#section-5.2.2
t->path_base.buf = NULL;
t->path_base.len = 0;
if (r->scheme.len) {
t->scheme = r->scheme;
t->authority = r->authority;
t->path = r->path;
t->query = r->query;
} else {
if (r->authority.len) {
t->authority = r->authority;
t->path = r->path;
t->query = r->query;
} else {
t->path = r->path;
if (!r->path.len) {
t->path_base = base->path;
if (r->query.len) {
t->query = r->query;
} else {
t->query = base->query;
}
} else {
if (r->path.buf[0] != '/') {
t->path_base = base->path;
}
t->query = r->query;
}
t->authority = base->authority;
}
t->scheme = base->scheme;
}
t->fragment = r->fragment;
#ifdef URI_DEBUG
fprintf(stderr, "RESOLVE URI\nBASE:\n");
serd_uri_dump(base, stderr);
fprintf(stderr, "URI:\n");
serd_uri_dump(r, stderr);
fprintf(stderr, "RESULT:\n");
serd_uri_dump(t, stderr);
fprintf(stderr, "\n");
#endif
return true;
}
SERD_API
size_t
serd_uri_serialise(const SerdURI* uri, SerdSink sink, void* stream)
{
// See http://tools.ietf.org/html/rfc3986#section-5.3
size_t write_size = 0;
#define WRITE(buf, len) \
write_size += len; \
if (len) { \
sink((const uint8_t*)buf, len, stream); \
}
#define WRITE_CHAR(c) WRITE(&(c), 1)
#define WRITE_COMPONENT(prefix, field, suffix) \
if ((field).len) { \
for (const uint8_t* c = (const uint8_t*)prefix; *c != '\0'; ++c) { \
WRITE(c, 1); \
} \
WRITE((field).buf, (field).len); \
for (const uint8_t* c = (const uint8_t*)suffix; *c != '\0'; ++c) { \
WRITE(c, 1); \
} \
}
WRITE_COMPONENT("", uri->scheme, ":");
WRITE_COMPONENT("//", uri->authority, "");
if (uri->path_base.len) {
if (!uri->path.buf && (uri->fragment.buf || uri->query.buf)) {
WRITE_COMPONENT("", uri->path_base, "");
} else {
/* Merge paths, removing dot components.
See http://tools.ietf.org/html/rfc3986#section-5.2.3
*/
const uint8_t* uri_first = uri->path.buf;
const uint8_t* uri_end = uri_first;
size_t up = 1;
if (uri_first) {
// Count and skip leading dot components
uri_end = uri->path.buf + uri->path.len;
while (uri_first < uri_end) {
if (!memcmp((const char*)uri_first, "./", 2)) {
uri_first += 2;
} else if (!memcmp((const char*)uri_first, "../", 3)) {
++up;
uri_first += 3;
} else if (!memcmp((const char*)uri_first, "..", 2)) {
++up;
uri_first += 2;
} else if (!memcmp((const char*)uri_first, ".", 1)) {
++uri_first;
} else if (!memcmp((const char*)uri_first, "//", 1)) {
++uri_first;
} else {
break;
}
}
if (uri->path.buf && uri->path_base.buf) {
// Find the up'th last slash
const uint8_t* base_last = uri->path_base.buf + uri->path_base.len - 1;
do {
if (*base_last == '/') {
--up;
}
} while (up > 0 && (--base_last > uri->path_base.buf));
// Write base URI prefix
const size_t base_len = base_last - uri->path_base.buf + 1;
WRITE(uri->path_base.buf, base_len);
} else {
// Relative path is just query or fragment, append it to full base URI
WRITE_COMPONENT("", uri->path_base, "");
}
// Write URI suffix
WRITE(uri_first, uri_end - uri_first);
}
}
} else {
WRITE_COMPONENT("", uri->path, "");
}
WRITE_COMPONENT("?", uri->query, "");
if (uri->fragment.buf) {
// Note uri->fragment.buf includes the leading `#'
WRITE_COMPONENT("", uri->fragment, "");
}
return write_size;
}
static size_t
serd_uri_string_length(const SerdURI* uri)
{
size_t len = uri->path_base.len;
#define ADD_LEN(field, n_delims) \
if ((field).len) { len += (field).len + (n_delims); }
ADD_LEN(uri->path, 1); // + possible leading `/'
ADD_LEN(uri->scheme, 1); // + trailing `:'
ADD_LEN(uri->authority, 2); // + leading `//'
ADD_LEN(uri->query, 1); // + leading `?'
ADD_LEN(uri->fragment, 1); // + leading `#'
return len;
}
static size_t
string_sink(const void* buf, size_t len, void* stream)
{
uint8_t** ptr = (uint8_t**)stream;
memcpy(*ptr, buf, len);
*ptr += len;
return len;
}
SERD_API
SerdString*
serd_string_new_from_uri(const SerdURI* uri, SerdURI* out)
{
const size_t len = serd_uri_string_length(uri);
SerdString* str = malloc(sizeof(SerdString) + len + 1);
str->n_bytes = len + 1;
str->n_chars = len; // FIXME: UTF-8
uint8_t* ptr = str->buf;
const size_t actual_len = serd_uri_serialise(uri, string_sink, &ptr);
str->buf[actual_len] = '\0';
str->n_bytes = actual_len + 1;
str->n_chars = str->n_bytes - 1; // FIXME: UTF-8
#ifdef URI_DEBUG
fwrite("URI: `'", 1, 6, stderr);
fwrite(str->buf, 1, str->n_bytes - 1, stderr);
fwrite("'\n", 1, 2, stderr);
#endif
return str;
}