From 8ce8fc66bd6d994194eabd2768aefccbe2090e43 Mon Sep 17 00:00:00 2001
From: Eli Zaretskii
Date: Fri, 18 Dec 2015 17:03:26 +0200
Subject: [PATCH] Support non-ASCII URLs
* src/url.c [HAVE_ICONV]: Include iconv.h and langinfo.h.
(convert_fname): New function.
[HAVE_ICONV]: Convert file name from remote encoding to local
encoding.
(url_file_name): Call convert_fname.
(filechr_table): Don't consider bytes in 128..159 as control
characters.
* tests/Test-ftp-iri.px: Fix the expected file name to match the
new file-name recoding. State the remote encoding explicitly on
the Wget command line.
* NEWS: Mention the URI recoding when built with libiconv.
---
NEWS | 7 +++++
src/url.c | 87 +++++++++++++++++++++++++++++++++++++++++++++++++--
tests/Test-ftp-iri.px | 4 +--
3 files changed, 94 insertions(+), 4 deletions(-)
diff --git a/NEWS b/NEWS
index c8cebad..c63c678 100644
--- a/NEWS
+++ b/NEWS
@@ -9,6 +9,13 @@ Please send GNU Wget bug reports to <bug-wget@gnu.org>.
* Changes in Wget X.Y.Z
+* When Wget is built with libiconv, it now converts non-ASCII URIs to
+ the locale's codeset when it creates files. The encoding of the
+ remote files and URIs is taken from --remote-encoding, defaulting to
+ UTF-8. The result is that non-ASCII URIs and files downloaded via
+ HTTP/HTTPS and FTP will have names on the local filesystem that
+ correspond to their remote names.
+
* Changes in Wget 1.17.1
* Fix compile error when IPv6 is disabled or SSL is not present.
diff --git a/src/url.c b/src/url.c
index c62867f..ca7fe29 100644
--- a/src/url.c
+++ b/src/url.c
@@ -43,6 +43,11 @@ as that of the covered work. */
#include "host.h" /* for is_valid_ipv6_address */
#include "c-strcase.h"
+#if HAVE_ICONV
+#include <iconv.h>
+#include <langinfo.h>
+#endif
+
#ifdef __VMS
#include "vms.h"
#endif /* def __VMS */
@@ -1399,8 +1404,8 @@ UVWC, VC, VC, VC, VC, VC, VC, VC, /* NUL SOH STX ETX EOT ENQ ACK BEL */
0, 0, 0, 0, 0, 0, 0, 0, /* p q r s t u v w */
0, 0, 0, 0, W, 0, 0, C, /* x y z { | } ~ DEL */
- C, C, C, C, C, C, C, C, C, C, C, C, C, C, C, C, /* 128-143 */
- C, C, C, C, C, C, C, C, C, C, C, C, C, C, C, C, /* 144-159 */
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 128-143 */
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 144-159 */
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
@@ -1531,6 +1536,82 @@ append_uri_pathel (const char *b, const char *e, bool escaped,
append_null (dest);
}
+/* Convert FNAME from the remote encoding (opt.encoding_remote, or UTF-8
+   when unset) to the local encoding (opt.locale, or the locale's codeset
+   when unset).  On success FNAME is freed and a newly allocated string
+   is returned; if the conversion cannot be set up or fails, or Wget was
+   built without iconv, FNAME itself is returned unchanged.  */
+static char *
+convert_fname (const char *fname)
+{
+  char *converted_fname = (char *)fname;
+#if HAVE_ICONV
+  const char *from_encoding = opt.encoding_remote;
+  const char *to_encoding = opt.locale;
+  iconv_t cd;
+  size_t len, done, inlen, outlen;
+  char *s;
+  const char *orig_fname = fname;
+
+  /* Defaults for remote and local encodings.  */
+  if (!from_encoding)
+    from_encoding = "UTF-8";
+  if (!to_encoding)
+    to_encoding = nl_langinfo (CODESET);
+
+  cd = iconv_open (to_encoding, from_encoding);
+  if (cd == (iconv_t)(-1))
+    {
+      /* No descriptor was opened, so iconv_close must not be called.  */
+      logprintf (LOG_VERBOSE, _("Conversion from %s to %s isn't supported\n"),
+                 quote (from_encoding), quote (to_encoding));
+      return converted_fname;
+    }
+
+  inlen = strlen (fname);
+  len = outlen = inlen * 2;
+  converted_fname = s = xmalloc (outlen + 1);
+
+  for (;;)
+    {
+      if (iconv (cd, &fname, &inlen, &s, &outlen) != (size_t)(-1)
+          && iconv (cd, NULL, NULL, &s, &outlen) != (size_t)(-1))
+        {
+          *s = '\0';    /* S is just past the last byte iconv wrote.  */
+          iconv_close (cd);
+          DEBUGP (("Converted file name '%s' (%s) -> '%s' (%s)\n",
+                   orig_fname, from_encoding, converted_fname, to_encoding));
+          xfree (orig_fname);
+          return converted_fname;
+        }
+
+      if (errno != E2BIG)
+        break;
+
+      /* Output buffer full: grow it and resume where iconv stopped.  */
+      done = s - converted_fname;
+      len = 2 * len + 16;
+      converted_fname = xrealloc (converted_fname, len + 1);
+      s = converted_fname + done;
+      outlen = len - done;
+    }
+
+  /* The conversion failed; report why and fall back to the original name.  */
+  if (errno == EINVAL || errno == EILSEQ)
+    logprintf (LOG_VERBOSE,
+               _("Incomplete or invalid multibyte sequence encountered\n"));
+  else /* Weird, we got an unspecified error */
+    logprintf (LOG_VERBOSE, _("Unhandled errno %d\n"), errno);
+  xfree (converted_fname);
+  converted_fname = (char *)orig_fname;
+  DEBUGP (("Failed to convert file name '%s' (%s) -> '?' (%s)\n",
+           orig_fname, from_encoding, to_encoding));
+  iconv_close (cd);
+#endif
+
+  return converted_fname;
+}
+
/* Append to DEST the directory structure that corresponds the
directory part of URL's path. For example, if the URL is
http://server/dir1/dir2/file, this appends "/dir1/dir2".
@@ -1706,6 +1787,8 @@ url_file_name (const struct url *u, char *replaced_filename)
xfree (temp_fnres.base);
+ fname = convert_fname (fname);
+
/* Check the cases in which the unique extensions are not used:
1) Clobbering is turned off (-nc).
2) Retrieval with regetting.
diff --git a/tests/Test-ftp-iri.px b/tests/Test-ftp-iri.px
index e196dbe..42e0eca 100755
--- a/tests/Test-ftp-iri.px
+++ b/tests/Test-ftp-iri.px
@@ -26,12 +26,12 @@ my %urls = (
},
);
-my $cmdline = $WgetTest::WGETPATH . " --local-encoding=iso-8859-1 -S ftp://localhost:{{port}}/fran${ccedilla_l1}ais.txt";
+my $cmdline = $WgetTest::WGETPATH . " --local-encoding=iso-8859-1 --remote-encoding=utf-8 -S ftp://localhost:{{port}}/fran${ccedilla_l1}ais.txt";
my $expected_error_code = 0;
my %expected_downloaded_files = (
- "fran${ccedilla_u8}ais.txt" => {
+ "fran${ccedilla_l1}ais.txt" => {
content => $francais,
},
);
--
2.6.4.windows.1