diff options
author | Peter Karlsson <peterk> | 2002-12-05 23:37:40 +0000 |
---|---|---|
committer | Peter Karlsson <peterk> | 2002-12-05 23:37:40 +0000 |
commit | e9609e2a37f81c989af2575b412fddfbb36bf22f (patch) | |
tree | 5baafe43ad93c5e93535af8df9da0c6fef75e8b6 /copypage.pl | |
parent | a8edbb3b304c0aec7b03d75d509d8e0fc1585d5e (diff) |
More intelligent handling of ISO 8859-1 characters when copying pages to
other ISO 8859 encodings: Only recode those high-bit characters that differ
from ISO 8859-1 into entities, keeping as many characters as possible in
their native form.
CVS version numbers
copypage.pl: 1.21 -> 1.22
Diffstat (limited to 'copypage.pl')
-rwxr-xr-x | copypage.pl | 72 |
1 files changed, 54 insertions, 18 deletions
diff --git a/copypage.pl b/copypage.pl index baa4ca5b719..723a1c354d1 100755 --- a/copypage.pl +++ b/copypage.pl @@ -14,6 +14,13 @@ use File::Path; +# Declare variables only used in references to avoid warnings +use vars qw(@iso_8859_2_compat @iso_8859_3_compat @iso_8859_4_compat + @iso_8859_5_compat @iso_8859_6_compat @iso_8859_7_compat + @iso_8859_8_compat @iso_8859_9_compat @iso_8859_10_compat + @iso_8859_13_compat @iso_8859_14_compat @iso_8859_15_compat + @iso_8859_16_compat); + # Get configuration if (exists $ENV{DWWW_LANG}) { @@ -41,24 +48,10 @@ if ($#ARGV == -1) print "and adds the translation-check header with the current revision.\n"; print "If the directory does not exist, it will be created, and the Makefile\n"; print "copied or created, depending on your language.conf setting.\n\n"; - print "You can either keep or not keep the 'english/' part of the path.\n"; + print "The 'english/' part of the input path is optional.\n"; exit; } -# Check destination character encoding -my $recode = 0; -if (open WMLRC, "$language/.wmlrc") -{ - while (<WMLRC>) - { - if (s/^-D CHARSET=//) - { - $recode = 1 unless /^iso-8859-15?$/i; - last; - } - } -} - # Table of entities used when copying to non-latin1 encodings @entities = ( '&nbsp;', '&iexcl;', '&cent;', '&pound;', '&curren;', '&yen;', @@ -79,13 +72,49 @@ if (open WMLRC, "$language/.wmlrc") '&uuml;', '&yacute;', '&thorn;', '&yuml;' ); +# Compatibility tables for the iso-8859 series; 1 indicates that the +# codepoint is the same as in iso-8859-1. Used to perform partial remaps +# for these. 
+@iso_8859_2_compat = (1,0,0,0,1,0,0,1,1,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,1,1,0,1,0,0,1,0,1,0,1,0,1,1,0,0,0,0,1,1,0,1,1,0,0,1,0,1,1,0,1,0,1,1,0,1,0,0,1,0,1,0,1,0,1,1,0,0,0,0,1,1,0,1,1,0,0,1,0,1,1,0,0); +@iso_8859_3_compat = (1,0,0,1,1,0,0,1,1,0,0,0,0,1,0,0,1,0,1,1,1,1,0,1,1,0,0,0,0,1,0,0,1,1,1,0,1,0,0,1,1,1,1,1,1,1,1,1,0,1,1,1,1,0,1,1,0,1,1,1,1,0,0,1,1,1,1,0,1,0,0,1,1,1,1,1,1,1,1,1,0,1,1,1,1,0,1,1,0,1,1,1,1,0,0,0); +@iso_8859_4_compat = (1,0,0,0,1,0,0,1,1,0,0,0,0,1,0,1,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,1,1,1,1,1,1,0,0,1,0,1,0,1,1,0,0,0,0,0,1,1,1,1,1,0,1,1,1,0,0,1,0,1,1,1,1,1,1,0,0,1,0,1,0,1,1,0,0,0,0,0,1,1,1,1,1,0,1,1,1,0,0,0); +@iso_8859_5_compat = (1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0); +@iso_8859_6_compat = (1,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0); +@iso_8859_7_compat = (1,0,0,1,0,0,1,1,1,1,0,1,1,1,0,0,1,1,1,1,0,0,0,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0); +@iso_8859_8_compat = (1,0,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0); +@iso_8859_9_compat = (1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,0,0,1); +@iso_8859_10_compat =(1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,0,0,1,0,1,0,1,1,1,1,0,0,1,1,1,1,0,1,0,1,1,1,1,1,1,0,1,1,1,1,1,1,0,0,1,0,1,0,1,1,1,1,0,0,1,1,1,1,0,1,0,1,1,1,1,1,0); +@iso_8859_13_compat 
=(1,0,1,1,1,0,1,1,0,1,0,1,1,1,1,0,1,1,1,1,0,1,1,1,0,1,0,1,1,1,1,0,0,0,0,0,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,1,1,1,0,0,0,0,1,0,0,1,0,0,0,0,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,1,1,1,0,0,0,0,1,0,0,0); +@iso_8859_14_compat =(1,0,0,1,0,0,0,1,0,1,0,0,0,1,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1,0,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1,0,1,1,1,1,1,1,0,1); +@iso_8859_15_compat =(1,1,1,1,0,1,0,1,0,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,0,1,1,1,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1); +@iso_8859_16_compat =(1,0,0,0,0,0,0,1,0,1,0,1,0,1,0,0,1,1,0,0,0,0,1,1,0,0,0,1,0,0,0,0,1,1,1,0,1,0,1,1,1,1,1,1,1,1,1,1,0,0,1,1,1,0,1,0,0,1,1,1,1,0,0,1,1,1,1,0,1,0,1,1,1,1,1,1,1,1,1,1,0,0,1,1,1,0,1,0,0,1,1,1,1,0,0,1); + +# Check destination character encoding +my $recode = 0; +if (open WMLRC, "$language/.wmlrc") +{ + while (<WMLRC>) + { + if (s/^-D CHARSET=//) + { + $recode = 1 unless /^iso-8859-1$/i; + if ($recode && /^iso-8859-([0-9]+)$/) + { + my $compattablename = 'iso_8859_' . $1 . '_compat'; + $compat = \@{$compattablename} if defined @{$compattablename}; + } + last; + } + } +} + # Loop over command line foreach $page (@ARGV) { # Check if valid source if ($page =~ /wml$/) { - &copy($page, $recode); + &copy($page, $recode, $compat); } else { @@ -98,6 +127,7 @@ sub copy { my $page = shift; my $recodelatin1 = shift; + my $compattable = shift; print "Processing $page...\n"; # Remove english/ from path @@ -215,9 +245,15 @@ sub copy } else { - if ($recodelatin1) + if (defined $compattable) + { + # Recode non-ASCII characters that are not identical to + # ISO 8859-1 into entities + s/([\xA0-\xFF])/$$compattable[ord($1)-160]?$1:$entities[ord($1)-160]/ge; + } + elsif ($recodelatin1) { - # Recode any non-ASCII characters as entities + # Recode any non-ASCII characters into entities s/([\xA0-\xFF])/$entities[ord($1)-160]/ge; } |