Better accommodations for reading non-UTF-8 PO files

Instead of reading the whole file and only then checking the charset, read just up to the first msgid/msgstr and check whether it specifies a charset right away. If it does, imbue the file handle with the correct charset and read the rest.

In particular, this:
- Avoids unnecessary warnings
- Avoids reading the file repeatedly

Signed-off-by: Alexander Golubev <fatzer2@gmail.com>
mquinson · Oct 29, 2024 · 180aa20 · 180aa20
1 parent 3266006
commit 180aa20
Show file tree

Hide file tree

Showing 10 changed files with 233 additions and 17 deletions.
diff --git a/lib/Locale/Po4a/Po.pm b/lib/Locale/Po4a/Po.pm
@@ -309,7 +309,7 @@ sub read {
     my $filename = shift
       or croak wrap_mod( "po4a::po", dgettext( "po4a", "Please provide a non-null filename" ) );
 
-    my $charset = shift // 'UTF-8';
+    my $charset = shift // '';
     $charset = 'UTF-8' if $charset eq "CHARSET";
     warn "Read $filename with encoding: $charset" if $debug{'encoding'};
 
@@ -335,43 +335,70 @@ sub read {
     if ( $filename eq '-' ) {
         $fh = *STDIN;
     } else {
-        open( $fh, "<:encoding($charset)", $filename )
+        open( $fh, "<", $filename )
           or croak wrap_mod( "po4a::po", dgettext( "po4a", "Cannot read from %s: %s" ), $filename, $! );
     }
 
-    ## Read paragraphs line-by-line
     my $pofile = "";
+    ## Read the first msgid/msgstr to detect encoding
     while ( defined( my $textline = <$fh> ) ) {
         $pofile .= $textline;
+        last if ( $textline =~ /^msgid/ );
+    }
+    while ( defined( my $textline = <$fh> ) ) {
+        $pofile .= $textline;
+        last if ( $textline =~ /^\s*$/ );
     }
-    $pofile =~ s/\r\n/\n/sg;    # Reading a DOS-encoded file from Linux (native files are handled in all cases)
 
-    # If we did not get the charset right, reload the file with the right one
-    if ( $pofile =~ /charset=(.*?)[\s\\]/ ) {
+    my $is_charset_detected;
+    # Detect the charset
+    if ( $pofile =~ /^msgid ""\s*$/m &&
+         $pofile =~ /^msgstr ""\s*$/m &&
+         $pofile =~ /charset=(.*?)[\s\\]/
+    ) {
         my $detected_charset = $1;
-
-        if ( $detected_charset ne $charset && uc($detected_charset) ne $charset && uc($detected_charset) ne 'CHARSET' )
-        {
-            warn "Reloading the PO file, changing the charset from '$charset' to '$detected_charset'"
-              if $debug{'encoding'};
-            $self->read( $filename, $detected_charset, $checkvalidity );
-            return;
+        if (   $detected_charset ne $charset &&
+            uc($detected_charset) ne $charset &&
+            uc($detected_charset) ne 'CHARSET'
+        ) {
+            warn "Detected '$detected_charset' in the PO file. Using it instead of '$charset'"
+                if $debug{'encoding'};
+            $charset = $detected_charset;
+            $is_charset_detected = 1;
         }
     }
 
+    if (not length $charset) {
+        warn "Failed to autodetect encoding of '$filename' and none was provided. Assuming 'UTF-8'." if $debug{'encoding'};
+        $charset = 'UTF-8';
+    }
     if ( $pofile =~ m/^\N{BOM}/ ) {    # UTF-8 BOM detected
         croak "BOM detected";
         croak wrap_msg(
-            dgettext(
-                "po4a",
-                "The file %s starts with a BOM char indicating that its encoding is UTF-8, but you specified %s instead."
-            ),
+            $is_charset_detected ? dgettext( "po4a",
+                    "The file %s starts with a BOM char indicating that its encoding is UTF-8, but '%s' was detected."
+                ) : dgettext( "po4a",
+                    "The file %s starts with a BOM char indicating that its encoding is UTF-8, but you specified '%s' instead."
+                ),
             $filename,
             $charset
         ) if ( uc($charset) ne 'UTF-8' );
         $pofile =~ s/^\N{BOM}//;
     }
 
+    # Decode already read part of the PO file with the charset
+    $pofile = decode("$charset", $pofile);
+
+    warn "Imbuing PO file '$filename' with '$charset'" if $debug{'encoding'};
+    binmode( $fh, ":encoding($charset)");
+
+    # Reading the rest of the file
+    while ( defined( my $textline = <$fh> ) ) {
+        $pofile .= $textline;
+    }
+
+    $pofile =~ s/\r\n/\n/sg;    # Reading a DOS-encoded file from Linux (native files are handled in all cases)
+
     if ( $filename ne '-' ) {
         close $fh
           or croak wrap_mod( "po4a::po", dgettext( "po4a", "Cannot close %s after reading: %s" ), $filename, $! );

diff --git a/t/charset.t b/t/charset.t
@@ -53,6 +53,13 @@ push @tests,
     'format'  => 'yaml',
     'options' => "-M UTF-8",
     'input'   => "charset/yaml/utf8.yaml",
+  },
+  {
+    'doc'            => 'PO file encoding: iso8859-1',
+    'po4a.conf'      => 'charset/po-iso8859/po4a.conf',
+    'closed_path'    => 'charset/*/',
+    'options'        => '--no-update',
+    'expected_files' => 'utf8.up.pod iso8859.up.pod',
   };
 
 run_all_tests(@tests);

diff --git a/t/charset/po-iso8859/common.pot b/t/charset/po-iso8859/common.pot
@@ -0,0 +1,42 @@
+# SOME DESCRIPTIVE TITLE
+# Copyright (C) YEAR Free Software Foundation, Inc.
+# This file is distributed under the same license as the PACKAGE package.
+# FIRST AUTHOR <EMAIL@ADDRESS>, YEAR.
+#
+#, fuzzy
+msgid ""
+msgstr ""
+"Project-Id-Version: PACKAGE VERSION\n"
+"POT-Creation-Date: 2024-10-29 00:02+0300\n"
+"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
+"Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
+"Language-Team: LANGUAGE <LL@li.org>\n"
+"Language: \n"
+"MIME-Version: 1.0\n"
+"Content-Type: text/plain; charset=UTF-8\n"
+"Content-Transfer-Encoding: 8bit\n"
+
+#. type: =head1
+#: utf8.pod:1
+msgid "ASCII-utf8-title"
+msgstr ""
+
+#. type: textblock
+#: utf8.pod:3 iso8859.pod:3
+msgid "common string with diactricks: é ê è â"
+msgstr ""
+
+#. type: textblock
+#: utf8.pod:5
+msgid "utf8 string with diactricks: é ê è â"
+msgstr ""
+
+#. type: =head1
+#: iso8859.pod:1
+msgid "ASCII-iso8859-title"
+msgstr ""
+
+#. type: textblock
+#: iso8859.pod:5
+msgid "iso8859 string with diactricks: é ê è â"
+msgstr ""
diff --git a/t/charset/po-iso8859/iso8859.pod b/t/charset/po-iso8859/iso8859.pod
@@ -0,0 +1,5 @@
+=head1 ASCII-iso8859-title
+
+common string with diactricks: é ê è â
+
+iso8859 string with diactricks: é ê è â
diff --git a/t/charset/po-iso8859/iso8859.up.po b/t/charset/po-iso8859/iso8859.up.po
@@ -0,0 +1,41 @@
+# Language up translations for po-iso package
+# Copyright (C) 2024 Free Software Foundation, Inc.
+# This file is distributed under the same license as the po-iso package.
+# A comment with diactricks: é ê è
+#
+msgid ""
+msgstr ""
+"Project-Id-Version: po-iso 8859\n"
+"POT-Creation-Date: 2024-10-29 00:02+0300\n"
+"PO-Revision-Date: 2024-10-29 00:02+0300\n"
+"Last-Translator: À Déâctrîc Pérsôn\n"
+"Language-Team: none\n"
+"Language: up\n"
+"MIME-Version: 1.0\n"
+"Content-Type: text/plain; charset=iso-8859-1\n"
+"Content-Transfer-Encoding: 8bit\n"
+
+#. type: =head1
+#: utf8.pod:1
+msgid "ASCII-utf8-title"
+msgstr "ASCII-UTF8-TITLE"
+
+#. type: textblock
+#: utf8.pod:3 iso8859.pod:3
+msgid "common string with diactricks: é ê è â"
+msgstr "COMMON STRING WITH DIACTRICKS: É Ê È Â"
+
+#. type: textblock
+#: utf8.pod:5
+msgid "utf8 string with diactricks: é ê è â"
+msgstr "UTF8 STRING WITH DIACTRICKS: É Ê È Â"
+
+#. type: =head1
+#: iso8859.pod:1
+msgid "ASCII-iso8859-title"
+msgstr "ASCII-ISO8859-TITLE"
+
+#. type: textblock
+#: iso8859.pod:5
+msgid "iso8859 string with diactricks: é ê è â"
+msgstr "ISO8859 STRING WITH DIACTRICKS: É Ê È Â"
diff --git a/t/charset/po-iso8859/iso8859.up.po.1 b/t/charset/po-iso8859/iso8859.up.po.1
@@ -0,0 +1,41 @@
+# Language up translations for po-iso package
+# Copyright (C) 2024 Free Software Foundation, Inc.
+# This file is distributed under the same license as the po-iso package.
+# Automatically generated, 2024.
+#
+msgid ""
+msgstr ""
+"Project-Id-Version: po-iso 8859\n"
+"POT-Creation-Date: 2024-10-29 00:02+0300\n"
+"PO-Revision-Date: 2024-10-29 00:02+0300\n"
+"Last-Translator: Automatically generated\n"
+"Language-Team: none\n"
+"Language: up\n"
+"MIME-Version: 1.0\n"
+"Content-Type: text/plain; charset=UTF-8\n"
+"Content-Transfer-Encoding: 8bit\n"
+
+#. type: =head1
+#: utf8.pod:1
+msgid "ASCII-utf8-title"
+msgstr ""
+
+#. type: textblock
+#: utf8.pod:3 iso8859.pod:3
+msgid "common string with diactricks: é ê è â"
+msgstr ""
+
+#. type: textblock
+#: utf8.pod:5
+msgid "utf8 string with diactricks: é ê è â"
+msgstr ""
+
+#. type: =head1
+#: iso8859.pod:1
+msgid "ASCII-iso8859-title"
+msgstr ""
+
+#. type: textblock
+#: iso8859.pod:5
+msgid "iso8859 string with diactricks: é ê è â"
+msgstr ""
diff --git a/t/charset/po-iso8859/iso8859.up.pod b/t/charset/po-iso8859/iso8859.up.pod
@@ -0,0 +1,20 @@
+
+        *****************************************************
+        *           GENERATED FILE, DO NOT EDIT             *
+        * THIS IS NO SOURCE FILE, BUT RESULT OF COMPILATION *
+        *****************************************************
+
+This file was generated by po4a(7). Do not store it (in VCS, for example),
+but store the PO file used as source file by po4a-translate.
+
+In fact, consider this as a binary, and the PO file as a regular .c file:
+If the PO get lost, keeping this translation up-to-date will be harder.
+
+=encoding ISO-8859-1
+
+=head1 ASCII-ISO8859-TITLE
+
+COMMON STRING WITH DIACTRICKS: É Ê È Â
+
+ISO8859 STRING WITH DIACTRICKS: É Ê È Â
+
diff --git a/t/charset/po-iso8859/po4a.conf b/t/charset/po-iso8859/po4a.conf
@@ -0,0 +1,8 @@
+[po4a_alias:pod_utf8] pod opt:"--master-charset UTF-8 --localized-charset UTF-8"
+[po4a_alias:pod_8859] pod opt:"--master-charset ISO-8859-1 --localized-charset ISO-8859-1"
+[po4a_paths] common.pot up:iso8859.up.po
+
+[type:pod_utf8] utf8.pod up:utf8.up.pod
+[type:pod_8859] iso8859.pod up:iso8859.up.pod
+
+
diff --git a/t/charset/po-iso8859/utf8.pod b/t/charset/po-iso8859/utf8.pod
@@ -0,0 +1,5 @@
+=head1 ASCII-utf8-title
+
+common string with diactricks: é ê è â
+
+utf8 string with diactricks: é ê è â
diff --git a/t/charset/po-iso8859/utf8.up.pod b/t/charset/po-iso8859/utf8.up.pod
@@ -0,0 +1,20 @@
+
+        *****************************************************
+        *           GENERATED FILE, DO NOT EDIT             *
+        * THIS IS NO SOURCE FILE, BUT RESULT OF COMPILATION *
+        *****************************************************
+
+This file was generated by po4a(7). Do not store it (in VCS, for example),
+but store the PO file used as source file by po4a-translate.
+
+In fact, consider this as a binary, and the PO file as a regular .c file:
+If the PO get lost, keeping this translation up-to-date will be harder.
+
+=encoding UTF-8
+
+=head1 ASCII-UTF8-TITLE
+
+COMMON STRING WITH DIACTRICKS: É Ê È Â
+
+UTF8 STRING WITH DIACTRICKS: É Ê È Â
+