[CWB] [PATCH] Fix CWB-CL and cl_canonical function call

Alberto Simões ambs at di.uminho.pt
Wed Jan 5 18:26:24 CET 2011


Hello

The cl_canonical function call now receives an extra argument (charset).
This patch updates CWB::CL accordingly and, of course, breaks compatibility
with all other code that uses the CWB::CL normalize method (but you have no
other option, I guess).

Yet again, this is a patch based on the C function call signature: 
adding an extra argument with the charset.

Given that this method is called on a corpus object, it would be better 
if the corpus object stored its charset information somewhere and used it 
on all subsequent calls to normalize.

For the moment, I have added a string argument to normalize: the name of 
the charset.

The patch is attached.

I can supply more in-depth patches, making the changes described above, but 
I do not want to mess much with the code without knowing whether it 
will be used or not :)

Thanks
ambs
-- 
Alberto Simões
-------------- next part --------------
Index: t/02_utility_functions.t
===================================================================
--- t/02_utility_functions.t	(revision 199)
+++ t/02_utility_functions.t	(working copy)
@@ -35,8 +35,8 @@
 our @case_folded=qw(baum baum bäume bäüme déjà déjà oû);
 our @diac_folded=qw(Baum baum Baume BAUME deja DEJA OU);
 our @both_folded=qw(baum baum baume baume deja deja ou);
-is_deeply([$C->normalize("c", @orig)], \@case_folded, "case-folding of strings"); # T9
-is_deeply([$C->normalize("d", @orig)], \@diac_folded, "accent-folding of strings");
-is_deeply([$C->normalize("cd", @orig)], \@both_folded, "case+accent-folding of strings");
+is_deeply([$C->normalize("latin1", "c", @orig)], \@case_folded, "case-folding of strings"); # T9
+is_deeply([$C->normalize("latin1", "d", @orig)], \@diac_folded, "accent-folding of strings");
+is_deeply([$C->normalize("latin1", "cd", @orig)], \@both_folded, "case+accent-folding of strings");
 
-# total: 11 tests
\ No newline at end of file
+# total: 11 tests
Index: lib/CWB/CL.pm
===================================================================
--- lib/CWB/CL.pm	(revision 199)
+++ lib/CWB/CL.pm	(working copy)
@@ -442,14 +442,47 @@
   }
 }
 
-sub normalize ( $$;@ ) {
-  my $self = shift;
-  my $flags = shift;
+sub normalize ( $$$;@ ) {
+  my $self    = shift;
+  my $charset = shift;
+  my $flags   = shift;
+  #### FIXME -- XXX
+  # 
+  # given that we have a corpus, probably we want to have
+  # the charset info stored there, and do not ask it to the user
+  # everytime. For now this seems a possible solution in the
+  # sense that it works :)
+  
+  my %charsets = (
+	ascii    => 0,
+        latin1   => 1,
+        latin2   => 2,
+        latin3   => 3,
+        latin4   => 4,
+        cyrillic => 5,
+        arabic   => 6,
+        greek    => 7,
+        hebrew   => 8,
+        latin5   => 9,
+        latin6   => 10,
+        latin7   => 11,
+        latin8   => 12,
+        latin9   => 13,
+        utf8     => 14,
+        unknown  => 15);
+  # Clean up charset name
+  $charset = lc($charset);
+  $charset =~ s/[-_]//g;
+  # defining ASCII as default charset. Change here
+  $charset = defined($charsets{$charset}) ?
+                      $charsets{$charset} : $charsets{ascii};
 
   croak "Usage:  \$corpus->normalize(('c' | 'd' | 'cd'), \$string, ...);"
     unless $flags =~ /^(c?d?|dc)$/;
 
-  return CWB::CL::cl_normalize($self->{'ptr'}, $CWB::CL::RegexFlags{$flags}, @_);
+  return CWB::CL::cl_normalize($self->{'ptr'}, 
+                               $charset,
+                               $CWB::CL::RegexFlags{$flags}, @_);
 }
 
 
Index: CL.xs
===================================================================
--- CL.xs	(revision 199)
+++ CL.xs	(working copy)
@@ -278,20 +278,21 @@
     RETVAL
 
 void
-cl_normalize(corpus, flags, ...)
-    Corpus*   corpus
-    int       flags
+cl_normalize(corpus, charset, flags, ...)
+    Corpus*  corpus
+    int      charset
+    int      flags
   PREINIT:
     int i, id, size;
     char *s_orig, *s_norm;
     SV *s_arg;
   PPCODE:
     last_cl_error = CDA_OK;
-    size = items - 2;
+    size = items - 3;
     if (size > 0) {
       EXTEND(sp, size);
       for (i = 0; i < size; i++) {
-        s_arg = ST(i+2);
+        s_arg = ST(i+3);
         if (!SvOK(s_arg)) {
           last_cl_error = CWB_CL_INVALID_ARG;
           PUSHs(sv_newmortal()); /* undef ID arguments return undef */
@@ -299,7 +300,7 @@
         else {
           s_orig = (char *) SvPV_nolen(s_arg);
           s_norm = cl_strdup(s_orig);
-          cl_string_canonical(s_norm, flags);
+          cl_string_canonical(s_norm, charset, flags);
           PUSHs(sv_2mortal(newSVpv(s_norm, 0)));
           cl_free(s_norm);
         }


More information about the CWB mailing list