aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Peter Karlsson <peterk> 2003-01-18 18:34:13 +0000
committer Peter Karlsson <peterk> 2003-01-18 18:34:13 +0000
commit c55f71887d620f13e99014f5a25350dc79890e92 (patch)
tree 803c01dcaeb16690c4c02f1350f68245e43449e3
parent b52353a93e5a64487b77cdd508720bc297c48af3 (diff)
Since we are forced to use entities at some places in the English source
files, when copying the page first decode any entities for the ISO 8859-1 area. The entities are then re-introduced if required by the output encoding. CVS version numbers copypage.pl: 1.22 -> 1.23
-rwxr-xr-x copypage.pl 32
1 files changed, 28 insertions, 4 deletions
diff --git a/copypage.pl b/copypage.pl
index 723a1c354d1..de66d21bb10 100755
--- a/copypage.pl
+++ b/copypage.pl
@@ -7,7 +7,7 @@
# Makefiles are not supported anymore for they bear too much space for errors.
# Originally written 2000-02-26 by Peter Karlsson <peterk@debian.org>
-# © Copyright 2000-2002 Software in the public interest, Inc.
+# © Copyright 2000-2003 Software in the public interest, Inc.
# This program is released under the GNU General Public License, v2.
# $Id$
@@ -245,15 +245,27 @@ sub copy
}
else
{
+ # Transform the string into a string that is fit for the encoding
+ # of the output language. We do that by first converting any
+ # SGML entities in the input stream into 8-bit ISO 8859-1
+ # encoding, and then convert extended characters (back) into
+ # entities if necessary for the target encoding.
+
+ # Decode
+ s/(&[^#;]+;)/&decodeentity($1)/ge;
+ s/&#(1[6-9][0-9]|2[0-4][0-9]|25[0-5]);/chr($1)/ge;
+
+ # Encode
if (defined $compattable)
{
- # Recode non-ASCII characters that are not identical to
- # ISO 8859-1 into entitites
+ # Output encoding is in part compatible with ISO 8859-1, only
+ # convert incompatible characters into entities.
s/([\xA0-\xFF])/$$compattable[ord($1)-160]?$1:$entities[ord($1)-160]/ge;
}
elsif ($recodelatin1)
{
- # Recode any non-ASCII characters into entities
+ # Output encoding is incompatible with ISO 8859-1, convert all
+ # 8-bit characters into entities.
s/([\xA0-\xFF])/$entities[ord($1)-160]/ge;
}
@@ -274,3 +286,15 @@ sub copy
print "and to remove $dsttitle when finished\n"
if defined $dsttitle;
}
+
+# Return the ISO 8859-1 character for the named entity, or the entity itself
+sub decodeentity
+{
+ my $ent = shift;
+ # Start at 1 to leave &nbsp; encoded; include the last index, $#entities
+ for (my $i = 1; $i <= $#entities; ++ $i)
+ {
+ return chr($i + 160) if $entities[$i] eq $ent;
+ }
+ return $ent;
+}

© 2014-2024 Faster IT GmbH | imprint | privacy policy