[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
Re: [Bug-wget] Support non-ASCII URLs
From: |
Eli Zaretskii |
Subject: |
Re: [Bug-wget] Support non-ASCII URLs |
Date: |
Tue, 15 Dec 2015 19:18:03 +0200 |
This second part is the main part of the change. It uses 'iconv',
when available, to convert the file names to the local encoding,
before saving the files. Note that the same function I modified is
used by ftp.c, so downloading via FTP should also work with non-ASCII
file names now; however, I didn't test that.
Thanks.
diff --git a/src/url.c b/src/url.c
index c62867f..d984bf7 100644
--- a/src/url.c
+++ b/src/url.c
@@ -43,6 +43,11 @@ as that of the covered work. */
#include "host.h" /* for is_valid_ipv6_address */
#include "c-strcase.h"
+#if HAVE_ICONV
+#include <iconv.h>
+#include <langinfo.h>
+#endif
+
#ifdef __VMS
#include "vms.h"
#endif /* def __VMS */
@@ -1531,6 +1536,90 @@ append_uri_pathel (const char *b, const char *e, bool
escaped,
append_null (dest);
}
+static char *
+convert_fname (const char *fname)
+{
+ char *converted_fname = (char *)fname;
+#if HAVE_ICONV
+ const char *from_encoding = opt.encoding_remote;
+ const char *to_encoding = opt.locale;
+ iconv_t cd;
+ /* sXXXav : hummm hard to guess... */
+ size_t len, done, inlen, outlen;
+ char *s;
+ const char *orig_fname = fname;;
+
+ /* Defaults for remote and local encodings. */
+ if (!from_encoding)
+ from_encoding = "UTF-8";
+ if (!to_encoding)
+ to_encoding = nl_langinfo (CODESET);
+
+ cd = iconv_open (to_encoding, from_encoding);
+ if (cd == (iconv_t)(-1))
+ logprintf (LOG_VERBOSE, _("Conversion from %s to %s isn't supported\n"),
+ quote (from_encoding), quote (to_encoding));
+ else
+ {
+ inlen = strlen (fname);
+ len = outlen = inlen * 2;
+ converted_fname = s = xmalloc (outlen + 1);
+ done = 0;
+
+ for (;;)
+ {
+ if (iconv (cd, &fname, &inlen, &s, &outlen) != (size_t)(-1))
+ {
+ /* Flush the last bytes. */
+ iconv (cd, NULL, NULL, &s, &outlen);
+ *(converted_fname + len - outlen - done) = '\0';
+ iconv_close(cd);
+ DEBUGP (("Converted file name '%s' (%s) -> '%s' (%s)\n",
+ orig_fname, from_encoding, converted_fname,
to_encoding));
+ return converted_fname;
+ }
+
+ /* Incomplete or invalid multibyte sequence */
+ if (errno == EINVAL || errno == EILSEQ)
+ {
+ logprintf (LOG_VERBOSE,
+ _("Incomplete or invalid multibyte sequence
encountered\n"));
+ xfree (converted_fname);
+ converted_fname = (char *)orig_fname;
+ break;
+ }
+ else if (errno == E2BIG) /* Output buffer full */
+ {
+ char *new;
+
+ done = len;
+ outlen = done + inlen * 2;
+ new = xmalloc (outlen + 1);
+ memcpy (new, converted_fname, done);
+ xfree (converted_fname);
+ converted_fname = new;
+ len = outlen;
+ s = converted_fname + done;
+ }
+ else /* Weird, we got an unspecified error */
+ {
+ logprintf (LOG_VERBOSE, _("Unhandled errno %d\n"), errno);
+ xfree (converted_fname);
+ converted_fname = (char *)orig_fname;
+ break;
+ }
+ }
+ DEBUGP (("Failed to convert file name '%s' (%s) -> '?' (%s)\n",
+ orig_fname, from_encoding, to_encoding));
+ xfree (fname);
+ }
+
+ iconv_close(cd);
+#endif
+
+ return converted_fname;
+}
+
/* Append to DEST the directory structure that corresponds the
directory part of URL's path. For example, if the URL is
http://server/dir1/dir2/file, this appends "/dir1/dir2".
@@ -1706,6 +1795,8 @@ url_file_name (const struct url *u, char
*replaced_filename)
xfree (temp_fnres.base);
+ fname = convert_fname (fname);
+
/* Check the cases in which the unique extensions are not used:
1) Clobbering is turned off (-nc).
2) Retrieval with regetting.
- Re: [Bug-wget] URL encoding issues (Was: GNU wget 1.17.1 released), (continued)
- Re: [Bug-wget] URL encoding issues (Was: GNU wget 1.17.1 released), Tim Ruehsen, 2015/12/15
- Re: [Bug-wget] URL encoding issues (Was: GNU wget 1.17.1 released), Tim Ruehsen, 2015/12/15
- Re: [Bug-wget] URL encoding issues (Was: GNU wget 1.17.1 released), Eli Zaretskii, 2015/12/15
- Re: [Bug-wget] URL encoding issues (Was: GNU wget 1.17.1 released), Tim Ruehsen, 2015/12/17
- Re: [Bug-wget] URL encoding issues (Was: GNU wget 1.17.1 released), Andries E. Brouwer, 2015/12/15
- Re: [Bug-wget] URL encoding issues (Was: GNU wget 1.17.1 released), Tim Ruehsen, 2015/12/15
- Re: [Bug-wget] Support non-ASCII URLs (Was: GNU wget 1.17.1 released), Eli Zaretskii, 2015/12/15
- Re: [Bug-wget] Support non-ASCII URLs,
Eli Zaretskii <=
- Re: [Bug-wget] Support non-ASCII URLs, Giuseppe Scrivano, 2015/12/16
- Re: [Bug-wget] Support non-ASCII URLs, Eli Zaretskii, 2015/12/16
- Re: [Bug-wget] Support non-ASCII URLs, Tim Ruehsen, 2015/12/17
- Re: [Bug-wget] Support non-ASCII URLs, Giuseppe Scrivano, 2015/12/17
- Re: [Bug-wget] Support non-ASCII URLs, Eli Zaretskii, 2015/12/17
- Re: [Bug-wget] Support non-ASCII URLs, Tim Rühsen, 2015/12/17
- Re: [Bug-wget] Support non-ASCII URLs, Eli Zaretskii, 2015/12/17
- Re: [Bug-wget] Support non-ASCII URLs, Giuseppe Scrivano, 2015/12/18
- Re: [Bug-wget] Support non-ASCII URLs, Eli Zaretskii, 2015/12/18
- Re: [Bug-wget] Support non-ASCII URLs, Giuseppe Scrivano, 2015/12/18