From 13c273f10c655e3805655c0a6a191551fc34567b Mon Sep 17 00:00:00 2001
From: chrisjbillington <chrisjbillington@gmail.com>
Date: Sun, 23 Oct 2022 11:51:33 +1100
Subject: [PATCH] Resolve unicode escape sequences not being processed
 correctly

In `process_unicode_escape_sequences()`, any backslash escape sequences
in the original string have their backslashes escaped by the first
`.encode('unicode-escape')`, and therefore come out of the
`.encode('unicode-escape').decode('unicode-escape')` round trip
unchanged instead of being converted.

That is not what we want - we want these sequences to pass through
the `.encode()` step unchanged, so that they will be converted to the
characters they represent upon `.decode()`.

This patch changes the `.encode()` step to pass through any ascii
characters unchanged, only escaping non-ascii characters. This ensures
any existing backslash escape sequences will be interpreted as the
character they represent upon `.decode()`.
---
 hg-fast-export.py | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/hg-fast-export.py b/hg-fast-export.py
index 93f35bf..406f952 100755
--- a/hg-fast-export.py
+++ b/hg-fast-export.py
@@ -434,9 +434,15 @@ def load_mapping(name, filename, mapping_is_raw):
   def process_unicode_escape_sequences(s):
     # Replace unicode escape sequences in the otherwise UTF8-encoded bytestring s with
     # the UTF8-encoded characters they represent. We need to do an additional
-    # .decode('utf8').encode('unicode-escape') to convert any non-ascii characters into
-    # their escape sequences so that the subsequent .decode('unicode-escape') succeeds:
-    return s.decode('utf8').encode('unicode-escape').decode('unicode-escape').encode('utf8')
+    # .decode('utf8').encode('ascii', 'backslashreplace') to convert any non-ascii
+    # characters into their escape sequences so that the subsequent
+    # .decode('unicode-escape') succeeds:
+    return (
+      s.decode('utf8')
+      .encode('ascii', 'backslashreplace')
+      .decode('unicode-escape')
+      .encode('utf8')
+    )
 
   def parse_quoted_line(line):
     m=quoted_regexp.match(line)