/* wget.c - Simple downloader to get the resource file from a HTTP server * * Copyright 2016 Lipi C.H. Lee * Copyright 2021 Eric Molitor * * Relevant sources of information * ------------------------------- * HTTP 1.1: https://www.rfc-editor.org/rfc/rfc7230 * Chunked Encoding: https://www.rfc-editor.org/rfc/rfc7230#section-4.1 * UTF-8 Encoded Header Values https://www.rfc-editor.org/rfc/rfc5987 * * Test URLs * --------- * Chunked Encoding: https://jigsaw.w3.org/HTTP/ChunkedScript * Redirect 301: https://jigsaw.w3.org/HTTP/300/301.html * Redirect 302: https://jigsaw.w3.org/HTTP/300/302.html * TLS 1.0: https://tls-v1-0.badssl.com:1010/ * TLS 1.1: https://tls-v1-1.badssl.com:1011/ * TLS 1.2: https://tls-v1-2.badssl.com:1012/ * TLS 1.3: https://tls13.1d.pw/ * Transfer Encoding [gzip|deflate]: https://jigsaw.w3.org/HTTP/TE/bar.txt * * * todo: Add support for configurable TLS versions * todo: Add support for ftp * todo: Add support for Transfer Encoding (gzip|deflate) * todo: Add support for RFC5987 USE_WGET(NEWTOY(wget, "<1>1(max-redirect)#<0=20d(debug)O(output-document):p(post-data):", TOYFLAG_USR|TOYFLAG_BIN)) config WGET bool "wget" default n help usage: wget [OPTIONS]... [URL] --max-redirect maximum redirections allowed -d, --debug print lots of debugging information -O, --output-document=FILE specify output filename -p, --post-data=DATA send data in body of POST request examples: wget http://www.example.com config WGET_LIBTLS bool "Enable HTTPS support for wget via LibTLS" default n depends on WGET && !WGET_OPENSSL help Enable HTTPS support for wget by linking to LibTLS. Supports using libtls, libretls or libtls-bearssl. config WGET_OPENSSL bool "Enable HTTPS support for wget via OpenSSL" default n depends on WGET && !WGET_LIBTLS help Enable HTTPS support for wget by linking to OpenSSL. */ #define FOR_wget #include "toys.h" #if CFG_WGET_LIBTLS #define WGET_SSL 1 #include #elif CFG_WGET_OPENSSL #define WGET_SSL 1 #include #include #include #else #define WGET_SSL 0 #endif #define HTTPS (WGET_SSL && TT.https) GLOBALS( char *p, *O; long max_redirect; int sock, https; char *url; #if CFG_WGET_LIBTLS struct tls *tls; #elif CFG_WGET_OPENSSL struct ssl_ctx_st *ctx; struct ssl_st *ssl; #endif ) // get http info in URL static void wget_info(char *url, char **host, char **port, char **path) { char *ss = url; // Must start with case insensitive http:// or https:// if (strncmp(url, "http", 4)) url = 0; else { url += 4; if ((TT.https = WGET_SSL && toupper(*url=='s'))) url++; if (!strstart(&url, "://")) url = 0; } if (!url) error_exit("unsupported protocol: %s", ss); if ((*path = strchr(*host = url, '/'))) *((*path)++) = 0; else *path = ""; // Get port number and trim literal IPv6 addresses if (**host=='[' && (ss = strchr(++*host, ']'))) { *ss++ = 0; *port = (*ss==':') ? ++ss : 0; } else if ((*port = strchr(*host, ':'))) *(*port++) = 0; if (!*port) *port = HTTPS ? "443" : "80"; } static void wget_connect(char *host, char *port) { if (!HTTPS) TT.sock = xconnectany(xgetaddrinfo(host, port, AF_UNSPEC, SOCK_STREAM, 0, 0)); else { #if CFG_WGET_LIBTLS struct tls_config *cfg = NULL; uint32_t protocols; if (!(TT.tls = tls_client())) error_exit("tls_client: %s", tls_error(TT.tls)); if (!(cfg = tls_config_new())) error_exit("tls_config_new: %s", tls_config_error(cfg)); if (tls_config_parse_protocols(&protocols, "tlsv1.2")) error_exit("tls_config_parse_protocols"); if (tls_config_set_protocols(cfg, protocols)) error_exit("tls_config_set_protocols: %s", tls_config_error(cfg)); if (tls_configure(TT.tls, cfg)) error_exit("tls_configure: %s", tls_error(TT.tls)); tls_config_free(cfg); if (tls_connect(TT.tls, host, port)) error_exit("tls_connect: %s", tls_error(TT.tls)); #elif CFG_WGET_OPENSSL SSL_library_init(); OpenSSL_add_all_algorithms(); SSL_load_error_strings(); ERR_load_crypto_strings(); TT.ctx = SSL_CTX_new(TLS_client_method()); if (!TT.ctx) error_exit("SSL_CTX_new"); TT.sock = xconnectany(xgetaddrinfo(host, port, AF_UNSPEC, SOCK_STREAM, 0, 0)); TT.ssl = SSL_new(TT.ctx); if (!TT.ssl) error_exit("SSL_new: %s", ERR_error_string(ERR_get_error(), NULL)); if (!SSL_set_tlsext_host_name(TT.ssl, host)) error_exit("SSL_set_tlsext_host_name: %s", ERR_error_string(ERR_get_error(), NULL)); SSL_set_fd(TT.ssl, TT.sock); if (SSL_connect(TT.ssl) == -1) error_exit("SSL_set_fd: %s", ERR_error_string(ERR_get_error(), NULL)); if (FLAG(d)) printf("TLS: %s\n", SSL_get_cipher(TT.ssl)); #endif } } static size_t wget_read(void *buf, size_t len) { if (!HTTPS) return xread(TT.sock, buf, len); else { char *err = 0; int ret; #if CFG_WGET_LIBTLS if ((ret = tls_read(TT.tls, buf, len))<0) err = tls_error(TT.tls); #elif CFG_WGET_OPENSSL if ((ret = SSL_read(TT.ssl, buf, len))<0) err = ERR_error_string(ERR_get_error(), 0); #endif if (err) error_exit("https read: %s", err); return ret; } } static void wget_write(void *buf, size_t len) { if (!HTTPS) xwrite(TT.sock, buf, len); else { char *err = 0; #if CFG_WGET_LIBTLS if (len != tls_write(TT.tls, buf, len)) err = tls_error(TT.tls); #elif CFG_WGET_OPENSSL if (len != SSL_write(TT.ssl, buf, len)) err = ERR_error_string(ERR_get_error(), 0); #endif if (err) error_exit("https write: %s", err); } } static void wget_close() { if (TT.sock) { xclose(TT.sock); TT.sock = 0; } #if CFG_WGET_LIBTLS if (TT.tls) { tls_close(TT.tls); tls_free(TT.tls); TT.tls = 0; } #elif CFG_WGET_OPENSSL if (TT.ssl) { SSL_shutdown(TT.ssl); SSL_free(TT.ssl); TT.ssl = 0; } if (TT.ctx) { SSL_CTX_free(TT.ctx); TT.ctx = 0; } #endif } static char *wget_find_header(char *header, char *val) { char *result = strcasestr(header, val); if (result) { result += strlen(val); result[strcspn(result, "\r\n")] = 0; } return result; } void wget_main(void) { long status = 0; size_t len, c_len = 0; int fd = 0; char *body, *index, *host, *port, *path, *chunked, *ss; char agent[] = "toybox wget/" TOYBOX_VERSION; TT.url = xstrdup(*toys.optargs); // Ask server for URL, following redirects until success while (status != 200) { if (!TT.max_redirect--) error_exit("Too many redirects"); // Connect and write request wget_info(TT.url, &host, &port, &path); if (TT.p) sprintf(toybuf, "Content-Length: %ld\r\n", strlen(TT.p)); ss = xmprintf("%s /%s HTTP/1.1\r\nHost: %s\r\nUser-Agent: %s\r\n" "Connection: close\r\n%s\r\n%s", FLAG(p) ? "POST" : "GET", path, host, agent, FLAG(p) ? toybuf : "", FLAG(p)?TT.p:""); if (FLAG(d)) printf("--- Request\n%s", ss); wget_connect(host, port); wget_write(ss, strlen(ss)); free(ss); // Read HTTP response into toybuf (probably with some body at end) for (index = toybuf; (len = wget_read(index, sizeof(toybuf)-(index-toybuf)))>0; index += len); // Split response into header and body, and null terminate header. // (RFC7230 says header cannot contain NUL.) if (!(body = memmem(ss = toybuf, index-toybuf, "\r\n\r\n", 4))) error_exit("response header too large"); *body = 0; body += 4; len = index-body; if (FLAG(d)) printf("--- Response\n%s\n\n", toybuf); status = strstart(&ss, "HTTP/1.1 ") ? strtol(ss, 0, 10) : 0; if ((status == 301) || (status == 302)) { if (!(ss = wget_find_header(toybuf, "Location: "))) error_exit("bad redirect"); free(TT.url); TT.url = xstrdup(ss); wget_close(); } else if (status != 200) error_exit("response: %ld", status); } // Open output file if (TT.O && !strcmp(TT.O, "-")) fd = 1; else if (!TT.O) { ss = wget_find_header(toybuf, "Content-Disposition: attachment; filename="); if (!ss && strchr(path, '/')) ss = getbasename(path); if (!ss || !*ss ) ss = "index.html"; if (!access((TT.O = ss), F_OK)) error_exit("%s already exists", TT.O); } // TODO: don't allow header/basename to write to stdout if (!fd) fd = xcreate(TT.O, (O_WRONLY|O_CREAT|O_TRUNC), 0644); // If chunked we offset the first buffer by 2 character, meaning it is // pointing at half of the header boundary, aka '\r\n'. This simplifies // parsing of the first c_len length by allowing the do while loop to fall // through on the first iteration and parse the first c_len size. chunked = wget_find_header(toybuf, "transfer-encoding: chunked"); if (chunked) memmove(toybuf, body-2, len += 2); else memmove(toybuf, body, len); // len is the size remaining in toybuf // c_len is the size of the remaining bytes in the current chunk do { if (chunked) { if (c_len > 0) { // We have an incomplete c_len to write if (len <= c_len) { // Buffer is less than the c_len so full write xwrite(fd, toybuf, len); c_len = c_len - len; len = 0; } else { // Buffer is larger than the c_len so partial write xwrite(fd, toybuf, c_len); len = len - c_len; memmove(toybuf, toybuf + c_len, len); c_len = 0; } } // If len is less than 2 we can't validate the chunk boundary so fall // through and go read more into toybuf. if (!c_len && (len > 2)) { char *c; if (strncmp(toybuf, "\r\n", 2) != 0) error_exit("chunk boundary"); // If we can't find the end of the new chunk signature fall through and // read more into toybuf. c = memmem(toybuf + 2, len - 2, "\r\n",2); if (c) { c_len = strtol(toybuf + 2, NULL, 16); if (!c_len) break; // A c_len of zero means we are complete len = len - (c - toybuf) - 2; memmove(toybuf, c + 2, len); } } if (len == sizeof(toybuf)) error_exit("chunk overflow"); } else { xwrite(fd, toybuf, len); len = 0; } } while ((len += wget_read(toybuf + len, sizeof(toybuf) - len)) > 0); wget_close(); free(TT.url); }