From 074101851eb07a8cbd27738f86c2c00178efa420 Mon Sep 17 00:00:00 2001 From: Peter Karlsson Date: Wed, 28 Aug 2002 18:11:29 +0000 Subject: If destination language's encoding is something other than iso-8859-1, encode all non-ASCII characters as entities. CVS version numbers copypage.pl: 1.16 -> 1.17 --- copypage.pl | 29 ++++++++++++++++++++++++++++- 1 file changed, 28 insertions(+), 1 deletion(-) (limited to 'copypage.pl') diff --git a/copypage.pl b/copypage.pl index 5d497dac988..1a66abe9751 100755 --- a/copypage.pl +++ b/copypage.pl @@ -49,13 +49,27 @@ if ($#ARGV == -1) exit; } +# Check destination character encoding +my $charset = 'iso-8859-1'; +if (open WMLRC, "$language/.wmlrc") +{ + while (<WMLRC>) + { + if (/^-D CHARSET=(.*)$/) + { + $charset = lc($1); + last; + } + } +} + # Loop over command line foreach $page (@ARGV) { # Check if valid source if ($page =~ /wml$/) { - &copy($page); + &copy($page, $charset ne 'iso-8859-1'); } else { @@ -67,6 +81,7 @@ foreach $page (@ARGV) sub copy { my $page = shift; + my $recodelatin1 = shift; print "Processing $page...\n"; # Remove english/ from path @@ -192,6 +207,12 @@ sub copy } else { + if ($recodelatin1) + { + # Recode any non-ASCII characters as entities + s/([\xA0-\xFF])/&entity($1)/ge; + } + print DST $_; } } @@ -210,3 +231,9 @@ sub copy if defined $dsttitle; } +# Subroutine to encode a latin-1 character as a HTML entity +sub entity +{ + # Exploiting the fact that latin-1 is a subset of Unicode + return '&#' . ord(shift) . ';' +} -- cgit v1.2.3