aboutsummaryrefslogtreecommitdiffstats
path: root/copypage.pl
diff options
context:
space:
mode:
authorPeter Karlsson <peterk>2003-01-18 18:34:13 +0000
committerPeter Karlsson <peterk>2003-01-18 18:34:13 +0000
commitc55f71887d620f13e99014f5a25350dc79890e92 (patch)
tree803c01dcaeb16690c4c02f1350f68245e43449e3 /copypage.pl
parentb52353a93e5a64487b77cdd508720bc297c48af3 (diff)
Since we are forced to use entities in some places in the English source
files, first decode any entities in the ISO 8859-1 range when copying a page. The entities are then re-introduced if required by the output encoding. CVS version numbers: copypage.pl: 1.22 -> 1.23
Diffstat (limited to 'copypage.pl')
-rwxr-xr-xcopypage.pl32
1 files changed, 28 insertions, 4 deletions
diff --git a/copypage.pl b/copypage.pl
index 723a1c354d1..de66d21bb10 100755
--- a/copypage.pl
+++ b/copypage.pl
@@ -7,7 +7,7 @@
# Makefiles are not supported anymore for they bear too much space for errors.
# Originally written 2000-02-26 by Peter Karlsson <peterk@debian.org>
-# © Copyright 2000-2002 Software in the public interest, Inc.
+# © Copyright 2000-2003 Software in the public interest, Inc.
# This program is released under the GNU General Public License, v2.
# $Id$
@@ -245,15 +245,27 @@ sub copy
}
else
{
+ # Transform the string into a string that is fit for the encoding
+ # of the output language. We do that by first converting any
+ # SGML entities in the input stream into 8-bit ISO 8859-1
+ # encoding, and then convert extended characters (back) into
+ # entities if necessary for the target encoding.
+
+ # Decode
+ s/(&[^#;]+;)/&decodeentity($1)/ge;
+ s/&#(1[6-9][0-9]|2[0-4][0-9]|25[0-5]);/chr($1)/ge;
+
+ # Encode
if (defined $compattable)
{
- # Recode non-ASCII characters that are not identical to
- # ISO 8859-1 into entitites
+ # Output encoding is in part compatible with ISO 8859-1, only
+ # convert incompatible characters into entities.
s/([\xA0-\xFF])/$$compattable[ord($1)-160]?$1:$entities[ord($1)-160]/ge;
}
elsif ($recodelatin1)
{
- # Recode any non-ASCII characters into entities
+ # Output encoding is incompatible with ISO 8859-1, convert all
+ # 8-bit characters into entities.
s/([\xA0-\xFF])/$entities[ord($1)-160]/ge;
}
@@ -274,3 +286,15 @@ sub copy
print "and to remove $dsttitle when finished\n"
if defined $dsttitle;
}
+
+# Return the ISO 8859-1 character for the given named entity, or the entity text unchanged if it is not listed in @entities
+sub decodeentity
+{
+ my $ent = shift;
+ # Start at one to avoid decoding &nbsp;; run through $#entities inclusive so the last table entry is decoded too
+ for my $i (1 .. $#entities)
+ {
+ return chr($i + 160) if $entities[$i] eq $ent;
+ }
+ return $ent;
+}

© 2014-2024 Faster IT GmbH | imprint | privacy policy