From 074101851eb07a8cbd27738f86c2c00178efa420 Mon Sep 17 00:00:00 2001
From: Peter Karlsson <peterk>
Date: Wed, 28 Aug 2002 18:11:29 +0000
Subject: If destination language's encoding is something other than
 iso-8859-1, encode all non-ASCII characters as entities.

CVS version numbers

copypage.pl: 1.16 -> 1.17
---
 copypage.pl | 29 ++++++++++++++++++++++++++++-
 1 file changed, 28 insertions(+), 1 deletion(-)

(limited to 'copypage.pl')
diff --git a/copypage.pl b/copypage.pl
index 5d497dac988..1a66abe9751 100755
--- a/copypage.pl
+++ b/copypage.pl
@@ -49,13 +49,27 @@ if ($#ARGV == -1)
 	exit;
 }
 
+# Check destination character encoding (iso-8859-1 unless .wmlrc says otherwise)
+my $charset = 'iso-8859-1';
+if (open WMLRC, '<', "$language/.wmlrc")
+{
+	while (<WMLRC>)
+	{
+		# Strip trailing whitespace/CR so the later string comparison is exact
+		next unless /^-D CHARSET=(.*?)\s*$/;
+		$charset = lc($1);
+		last;
+	}
+	close WMLRC;
+}
+
 # Loop over command line
 foreach $page (@ARGV)
 {
 	# Check if valid source
 	if ($page =~ /wml$/)
 	{
-		&copy($page);
+		&copy($page, $charset ne 'iso-8859-1');
 	}
 	else
 	{
@@ -67,6 +81,7 @@ foreach $page (@ARGV)
 sub copy
 {
 	my $page = shift;
+	my $recodelatin1 = shift;
 	print "Processing $page...\n";
 
 	# Remove english/ from path
@@ -192,6 +207,12 @@ sub copy
 		}
 		else
 		{
+			if ($recodelatin1)
+			{
+				# Recode latin-1 characters 0xA0-0xFF as numeric entities
+				s/([\xA0-\xFF])/&entity($1)/ge;
+			}
+
 			print DST $_;
 		}
 	}
@@ -210,3 +231,9 @@ sub copy
 		if defined $dsttitle;
 }
 
+# Encode one latin-1 character as a numeric HTML entity (e.g. 0xE9 -> &#233;)
+sub entity
+{
+	my ($char) = @_;	# latin-1 code points equal Unicode ones, so ord() suffices
+	return sprintf('&#%d;', ord($char));
+}
-- 
cgit v1.2.3