From 13c273f10c655e3805655c0a6a191551fc34567b Mon Sep 17 00:00:00 2001 From: chrisjbillington Date: Sun, 23 Oct 2022 11:51:33 +1100 Subject: [PATCH] Resolve unicode escape sequences not being processed correctly In `process_unicode_escape_sequences()`, any backslash escape sequences in the original string are themselves escaped by the first `.encode('unicode-escape')` (each backslash is doubled) and therefore survive the `.encode('unicode-escape').decode('unicode-escape')` round trip unchanged instead of being resolved. That is not what we want - we want these sequences to pass through the `.encode()` step unchanged, so that they are converted to the characters they represent upon `.decode()`. This patch changes the `.encode()` step to pass through any ASCII characters unchanged, only escaping non-ASCII characters. This ensures any existing backslash escape sequences will be interpreted as the characters they represent upon `.decode()`. --- hg-fast-export.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/hg-fast-export.py b/hg-fast-export.py index 93f35bf..406f952 100755 --- a/hg-fast-export.py +++ b/hg-fast-export.py @@ -434,9 +434,15 @@ def load_mapping(name, filename, mapping_is_raw): def process_unicode_escape_sequences(s): # Replace unicode escape sequences in the otherwise UTF8-encoded bytestring s with # the UTF8-encoded characters they represent. We need to do an additional - # .decode('utf8').encode('unicode-escape') to convert any non-ascii characters into - # their escape sequences so that the subsequent .decode('unicode-escape') succeeds: - return s.decode('utf8').encode('unicode-escape').decode('unicode-escape').encode('utf8') + # .decode('utf8').encode('ascii', 'backslashreplace') to convert any non-ascii + # characters into their escape sequences so that the subsequent + # .decode('unicode-escape') succeeds: + return ( + s.decode('utf8') + .encode('ascii', 'backslashreplace') + .decode('unicode-escape') + .encode('utf8') + ) def parse_quoted_line(line): m=quoted_regexp.match(line)