[CWB] Error #1300 generating word frequency lists

José Manuel Martínez Martínez chozelinek at gmail.com
Wed Aug 29 08:54:04 CEST 2018


Hi Andrew,

thanks for your answer. I've modified `db.inc.php`, along with two smaller
changes in `freqtable.inc.php` and `indexforms-queries.inc.php`.

This is the diff of my modifications:

Index: lib/db.inc.php

===================================================================

--- lib/db.inc.php (revision 1057)

+++ lib/db.inc.php (working copy)

@@ -401,7 +401,7 @@

  primary key(refnumber),

  key(text_id) $extra_sql_keys



- ) CHARACTER SET utf8 COLLATE utf8_bin";

+ ) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci";

  /*

  * note the use of a binary collation for distribution DBs, since

  * they always contain handle IDs, not word or tag material.

@@ -478,7 +478,7 @@

  `$att` varchar(40) NOT NULL";

  $create_statement .= ",

  key (refnumber)

- ) CHARACTER SET utf8 COLLATE {$Corpus->sql_collation}";

+ ) CHARACTER SET utf8mb4 COLLATE {$Corpus->sql_collation}";



  break;



@@ -533,7 +533,7 @@

  endPosition int unsigned NOT NULL,

  refnumber mediumint unsigned NOT NULL AUTO_INCREMENT,

  primary key(refnumber)

- ) CHARACTER SET utf8 COLLATE {$Corpus->sql_collation}";

+ ) CHARACTER SET utf8mb4 COLLATE {$Corpus->sql_collation}";



  break;



@@ -551,7 +551,7 @@

  category varchar(40),

  primary key(refnumber),

  key(category)

- ) CHARACTER SET utf8 COLLATE {$Corpus->sql_collation}";

+ ) CHARACTER SET utf8mb4 COLLATE {$Corpus->sql_collation}";



  break;



Index: lib/freqtable.inc.php

===================================================================

--- lib/freqtable.inc.php (revision 1057)

+++ lib/freqtable.inc.php (working copy)

@@ -192,7 +192,7 @@

  global $Config;

  global $Corpus;

  global $User;

-

+ php_execute_time_unlimit();

  global $cqp;



  if (empty($cqp))

Index: lib/indexforms-queries.inc.php

===================================================================

--- lib/indexforms-queries.inc.php (revision 1057)

+++ lib/indexforms-queries.inc.php (working copy)

@@ -74,7 +74,7 @@

  'sq_case'   => 'Simple query (case-sensitive)',

  );

  if (! array_key_exists($qmode, $modemap) )

- $qmode = ($Corpus->uses_case_sensitivity ? 'sq_case' : 'sq_nocase');

+ $qmode = ($Corpus->uses_case_sensitivity ? 'sq_case' : 'cqp');

  /* includes NULL, empty */



But I'm getting this error when I try to run collocations:

A MySQL query did not run successfully!





Error # 1253: COLLATION 'utf8_general_ci' is not valid for CHARACTER SET
'utf8mb4'

I guess that I need to touch the code in more places, and probably do that
in a test environment first (I put the change straight into production ;-) ).

With the command `grep -rn . -e 'utf8_'` I get this list of lines that
seem to mention UTF-8:

./bin/upgrade-database.php:104:$Config->mysql_*utf8_*set_required =
(isset($mysql_*utf8_*set_required) && $mysql_*utf8_*set_required);

./bin/upgrade-database.php:225: ) ENGINE=InnoDB CHARACTER SET utf8 COLLATE
*utf8_*bin',

./bin/upgrade-database.php:231: ) ENGINE=InnoDB CHARACTER SET utf8 COLLATE
*utf8_*bin',

./bin/upgrade-database.php:265: (`corpus` varchar(20) NOT NULL,`target`
varchar(20) NOT NULL) ENGINE=InnoDB CHARACTER SET utf8 COLLATE *utf8_*bin',

./bin/upgrade-database.php:313:                    ) ENGINE=InnoDB
CHARACTER SET utf8 COLLATE *utf8_*bin',

./bin/upgrade-database.php:382: ) ENGINE=InnoDB CHARACTER SET utf8 COLLATE
*utf8_*bin"

./bin/upgrade-database.php:401:         ) ENGINE=InnoDB CHARACTER SET utf8
COLLATE *utf8_*bin',

./bin/upgrade-database.php:411:         ) ENGINE=InnoDB CHARACTER SET utf8
COLLATE *utf8_*bin'

./bin/upgrade-database.php:665: ) CHARACTER SET utf8 COLLATE *utf8_*bin'

./bin/upgrade-database.php:690: ) CHARACTER SET utf8 COLLATE *utf8_*bin",

./bin/upgrade-database.php:698: ) CHARACTER SET utf8 COLLATE *utf8_*bin",

./bin/upgrade-database.php:739: 'alter table `annotation_metadata` collate
*utf8_*bin',

./bin/upgrade-database.php:740: 'alter table `annotation_template_info`
collate *utf8_*bin',

./bin/upgrade-database.php:741: 'alter table `annotation_template_content`
collate *utf8_*bin',

./bin/upgrade-database.php:742: 'alter table `corpus_metadata_variable`
collate *utf8_*bin',

./bin/upgrade-database.php:743: 'alter table `saved_dbs` collate *utf8_*
bin',

./bin/upgrade-database.php:744: 'alter table `saved_freqtables` collate
*utf8_*bin',

./bin/upgrade-database.php:745: 'alter table `saved_subcorpora` collate
*utf8_*bin',

./bin/upgrade-database.php:746: 'alter table `user_memberships` collate
*utf8_*bin',

./bin/upgrade-database.php:747: 'alter table `user_privilege_info` collate
*utf8_*bin',

./bin/upgrade-database.php:748: 'alter table `query_history` collate *utf8_*
bin',

./bin/upgrade-database.php:749: 'alter table `system_processes` collate
*utf8_*bin',

./bin/upgrade-database.php:750: 'alter table `text_metadata_fields` collate
*utf8_*bin',

./bin/upgrade-database.php:751: 'alter table `text_metadata_values` collate
*utf8_*bin',

./bin/upgrade-database.php:752: 'alter table `user_info` collate *utf8_*
bin',

./bin/upgrade-database.php:753: /* using *utf8_*bin for user_info implies
the following for specific columnss: */

./bin/upgrade-database.php:754: 'alter table `user_info` modify column
`affiliation` varchar(255) CHARACTER SET utf8 COLLATE *utf8_*general_ci
default NULL',

./bin/upgrade-database.php:755: 'alter table `user_info` modify column
`email` varchar(255) CHARACTER SET utf8 COLLATE *utf8_*general_ci  default
NULL',

./bin/upgrade-database.php:756: 'alter table `user_info` modify column
`realname` varchar(255) CHARACTER SET utf8 COLLATE *utf8_*general_ci
default NULL',

./bin/upgrade-database.php:780: ) CHARACTER SET utf8 COLLATE *utf8_*bin",

./bin/upgrade-database.php:789: ) CHARACTER SET utf8 COLLATE *utf8_*bin",

./bin/upgrade-database.php:794: ) CHARACTER SET utf8 COLLATE *utf8_*bin",

./bin/upgrade-database.php:802: ) CHARACTER SET utf8 COLLATE *utf8_*bin",

./bin/upgrade-database.php:884:         ) CHARACTER SET utf8 COLLATE *utf8_*
bin",

./bin/upgrade-database.php:891:          ) CHARACTER SET utf8 COLLATE
*utf8_*bin"

./bin/upgrade-database.php:953: 'alter table user_info modify column
`username` varchar(30) charset utf8 collate *utf8_*bin NOT NULL',

./bin/upgrade-database.php:959: ) CHARACTER SET utf8 COLLATE *utf8_*bin',

./bin/upgrade-database.php:968: ) CHARACTER SET utf8 COLLATE *utf8_*
general_ci',

./bin/upgrade-database.php:973: ) CHARACTER SET utf8 COLLATE *utf8_*
general_ci'

./bin/upgrade-database.php:1085:   setting_name varchar(20) NOT NULL
collate *utf8_*bin,

./bin/upgrade-database.php:1088: ) CHARACTER SET utf8 COLLATE *utf8_*
general_ci',

./bin/upgrade-database.php:1100:   `group_name` varchar(20) NOT NULL UNIQUE
COLLATE *utf8_*bin,

./bin/upgrade-database.php:1104: ) CHARACTER SET utf8 COLLATE *utf8_*
general_ci',

./bin/upgrade-database.php:1107: CHARACTER SET utf8 COLLATE *utf8_*
general_ci',

./bin/upgrade-database.php:1170: ) CHARACTER SET utf8 COLLATE *utf8_*bin",

./bin/upgrade-database.php:1181: ) CHARACTER SET utf8 COLLATE *utf8_*
general_ci",

./bin/upgrade-database.php:1184: CHARACTER SET utf8 COLLATE *utf8_*
general_ci",

./bin/upgrade-database.php:1187: CHARACTER SET utf8 COLLATE *utf8_*
general_ci",

./bin/autosetup.php:69:$Config->mysql_*utf8_*set_required = $mysql_*utf8_*
set_required;

./lib/admin-lib.inc.php:478: ) CHARSET utf8 COLLATE *utf8_*bin ");

./lib/concordance-post.inc.php:601: $extra_sort_pos_sql .= ", before$i
COLLATE *utf8_*general_ci ";

./lib/concordance-post.inc.php:607: $extra_sort_pos_sql = ", after1 COLLATE
*utf8_*general_ci"

./lib/concordance-post.inc.php:608: . ", after2 COLLATE *utf8_*general_ci"

./lib/concordance-post.inc.php:609: . ", after3 COLLATE *utf8_*general_ci"

./lib/concordance-post.inc.php:610: . ", after4 COLLATE *utf8_*general_ci"

./lib/concordance-post.inc.php:611: . ", after5 COLLATE *utf8_*general_ci";

./lib/concordance-post.inc.php:617: $extra_sort_pos_sql .= ", after$i
COLLATE *utf8_*general_ci";

./lib/concordance-post.inc.php:649: ORDER BY $sort_position_sql COLLATE
*utf8_*general_ci  $extra_sort_pos_sql ";

./lib/concordance-post.inc.php:651: * we always use *utf8_*general_ci for
the actual sorting,

./lib/concordance-post.inc.php:652: * even if the collation of the sort DB
is actually *utf8_*bin

./lib/db.inc.php:278: $*utf8_*filename = $tabfile .'.utf8.tmp';

./lib/db.inc.php:281:                     $*utf8_*filename,

./lib/db.inc.php:286: rename($*utf8_*filename, $tabfile);

./lib/library.inc.php:180: if ($Config->mysql_*utf8_*set_required)

./lib/library.inc.php:549: return $corpus_info->uses_case_sensitivity ? '
*utf8_*bin' : '*utf8_*general_ci' ;

./lib/defaults.inc.php:80: METADATA_TYPE_CLASSIFICATION => 'varchar(255)
default NULL COLLATE *utf8_*bin',

./lib/defaults.inc.php:81: METADATA_TYPE_FREETEXT       => 'text default
NULL COLLATE *utf8_*general_ci',

./lib/defaults.inc.php:82: METADATA_TYPE_IDLINK         => 'varchar(255)
default NULL COLLATE *utf8_*bin',

./lib/defaults.inc.php:83: METADATA_TYPE_UNIQUE_ID      => 'varchar(255)
default NULL COLLATE *utf8_*bin',

./lib/defaults.inc.php:84: METADATA_TYPE_DATE           => 'varchar(255)
default NULL COLLATE *utf8_*bin',

./lib/defaults.inc.php:224:if (!isset($mysql_*utf8_*set_required))

./lib/defaults.inc.php:225: $mysql_*utf8_*set_required = true;

./lib/sql-definitions.inc.php:154: ) $engine CHARACTER SET utf8 COLLATE
*utf8_*bin";

./lib/sql-definitions.inc.php:166: ) $engine CHARACTER SET utf8 COLLATE
*utf8_*bin";

./lib/sql-definitions.inc.php:175: ) $engine CHARACTER SET utf8 COLLATE
*utf8_*bin";

./lib/sql-definitions.inc.php:187: ) $engine CHARACTER SET utf8 COLLATE
*utf8_*bin";

./lib/sql-definitions.inc.php:200: ) $engine CHARACTER SET utf8 COLLATE
*utf8_*bin";

./lib/sql-definitions.inc.php:209: ) $engine CHARACTER SET utf8 COLLATE
*utf8_*bin";

./lib/sql-definitions.inc.php:216: ) $engine CHARACTER SET utf8 COLLATE
*utf8_*bin";

./lib/sql-definitions.inc.php:225: ) $engine CHARACTER SET utf8 COLLATE
*utf8_*general_ci";

./lib/sql-definitions.inc.php:307: ) $engine CHARACTER SET utf8 COLLATE
*utf8_*general_ci";

./lib/sql-definitions.inc.php:316: ) $engine CHARACTER SET utf8 COLLATE
*utf8_*bin";

./lib/sql-definitions.inc.php:327: ) $engine CHARACTER SET utf8 COLLATE
*utf8_*bin";

./lib/sql-definitions.inc.php:340: ) $engine CHARACTER SET utf8 COLLATE
*utf8_*bin";

./lib/sql-definitions.inc.php:349: ) $engine CHARACTER SET utf8 COLLATE
*utf8_*bin";

./lib/sql-definitions.inc.php:359: ) $engine CHARACTER SET utf8 COLLATE
*utf8_*bin";

./lib/sql-definitions.inc.php:376: ) $engine CHARACTER SET utf8 collate
*utf8_*bin";

./lib/sql-definitions.inc.php:389: ) $engine CHARACTER SET utf8 COLLATE
*utf8_*bin";

./lib/sql-definitions.inc.php:410: ) $engine CHARACTER SET utf8 COLLATE
*utf8_*bin";

./lib/sql-definitions.inc.php:424: ) $engine CHARACTER SET utf8 COLLATE
*utf8_*bin";

./lib/sql-definitions.inc.php:437: ) $engine CHARACTER SET utf8 COLLATE
*utf8_*bin";

./lib/sql-definitions.inc.php:447: ) $engine CHARACTER SET utf8 COLLATE
*utf8_*bin";

./lib/sql-definitions.inc.php:474: ) $engine_if_fulltext_key_needed
CHARACTER SET utf8 COLLATE *utf8_*bin";

./lib/sql-definitions.inc.php:489: ) $engine CHARACTER SET utf8 COLLATE
*utf8_*bin";

./lib/sql-definitions.inc.php:503: ) $engine CHARACTER SET utf8 COLLATE
*utf8_*bin";

./lib/sql-definitions.inc.php:508: setting_name varchar(20) NOT NULL
collate *utf8_*bin,

./lib/sql-definitions.inc.php:511: ) CHARACTER SET utf8 COLLATE
*utf8_*general_ci";
/* note that for this one we don't care about the engine */

./lib/sql-definitions.inc.php:520: ) $engine CHARACTER SET utf8 COLLATE
*utf8_*bin";

./lib/sql-definitions.inc.php:528: `content` text character set utf8
collate *utf8_*bin,

./lib/sql-definitions.inc.php:531: ) $engine CHARACTER SET utf8 COLLATE
*utf8_*general_ci";

./lib/sql-definitions.inc.php:541: ) CHARACTER SET utf8 COLLATE *utf8_*bin";
/* note that for this one we don't care about the engine */

./lib/sql-definitions.inc.php:551: ) $engine CHARACTER SET utf8 COLLATE
*utf8_*bin";

./lib/sql-definitions.inc.php:563: ) $engine CHARACTER SET utf8 COLLATE
*utf8_*bin";

./lib/sql-definitions.inc.php:572: ) $engine CHARACTER SET utf8 COLLATE
*utf8_*bin";

./lib/sql-definitions.inc.php:582: ) $engine CHARACTER SET utf8 COLLATE
*utf8_*bin";

./lib/sql-definitions.inc.php:590: ) $engine CHARACTER SET utf8 COLLATE
*utf8_*general_ci";

./lib/sql-definitions.inc.php:598: ) $engine CHARACTER SET utf8 COLLATE
*utf8_*general_ci";

./lib/sql-definitions.inc.php:604: `group_name` varchar(20) NOT NULL UNIQUE
COLLATE *utf8_*bin,

./lib/sql-definitions.inc.php:608: ) $engine CHARACTER SET utf8 COLLATE
*utf8_*general_ci";

./lib/sql-definitions.inc.php:615: `realname` varchar(255) CHARACTER SET
utf8 COLLATE *utf8_*general_ci default NULL,

./lib/sql-definitions.inc.php:616: `email` varchar(255) CHARACTER SET utf8
COLLATE *utf8_*general_ci default NULL,

./lib/sql-definitions.inc.php:617: `affiliation` varchar(255) CHARACTER SET
utf8 COLLATE *utf8_*general_ci default NULL,

./lib/sql-definitions.inc.php:643: ) $engine CHARACTER SET utf8 COLLATE
*utf8_*bin";

./lib/sql-definitions.inc.php:655: ) $engine CHARACTER SET utf8 COLLATE
*utf8_*bin";

./lib/sql-definitions.inc.php:663: ) $engine CHARACTER SET utf8 COLLATE
*utf8_*bin";

./lib/sql-definitions.inc.php:673: ) $engine CHARACTER SET utf8 COLLATE
*utf8_*bin";

./lib/sql-definitions.inc.php:686: ) $engine CHARACTER SET utf8 COLLATE
*utf8_*bin";

./lib/sql-definitions.inc.php:698: ) $engine CHARACTER SET utf8 COLLATE
*utf8_*bin";

./lib/sql-definitions.inc.php:706: ) $engine CHARACTER SET utf8 COLLATE
*utf8_*bin";

./lib/sql-definitions.inc.php:717: ) $engine CHARACTER SET utf8 COLLATE
*utf8_*bin";

./lib/sql-definitions.inc.php:733: ) $engine CHARACTER SET utf8 COLLATE
*utf8_*bin";

./lib/freqtable.inc.php:110: $*utf8_*filename = $filename .'.utf8.tmp';

./lib/freqtable.inc.php:113:                     $*utf8_*filename,

./lib/freqtable.inc.php:118: rename($*utf8_*filename, $filename);

./lib/freqtable.inc.php:326: $*utf8_*filename = $master_table_loadfile
.'.utf8.tmp';

./lib/freqtable.inc.php:329:                     $*utf8_*filename,

./lib/freqtable.inc.php:334: rename($*utf8_*filename,
$master_table_loadfile);

./lib/freqtable.inc.php:692: CHARACTER SET utf8 COLLATE *utf8_*bin";

./lib/indexforms-adminhome.inc.php:1838: $result = do_mysql_query("select
username from user_info order by username collate *utf8_*general_ci");

./lib/indexforms-adminhome.inc.php:1980: where username collate
*utf8_*general_ci
like '%$term%'

./lib/indexforms-adminhome.inc.php:1981: or email collate *utf8_*general_ci
like '%$term%'

./lib/indexforms-adminhome.inc.php:1982: or realname collate *utf8_*general_ci
like '%$term%'

--
José Manuel Martínez Martínez
https://chozelinek.github.io


On Thu, Aug 9, 2018 at 12:57 AM Hardie, Andrew <a.hardie at lancaster.ac.uk>
wrote:

> Oh, this again! Stefan has run into this problem with some of the datasets
> collected at FAU if I recall correctly.
>
>
>
> Yes, moving to utf8mb4 is the only solution. I am going to shift the whole
> of CQPweb across to the mb4 variant at the next major upgrade.  (At the
> same time I’ll explicitly declare the shorter fields that contain only
> handle references to ASCII only.)
>
>
>
> You’ll find this problem repeats if any of these 4-byte characters fall
> into a collocation window or if you attempt to sort a concordance that has
> them, I’m afraid. To prevent it, hack the table-create code in db.inc.php
> and add mb4 to the charset where necessary.
>
>
>
> And in the meantime we can collectively curse whichever genius invented a
> partial-Unicode string format and had the gall to call it “utf8”.
>
>
>
> best
>
>
>
> Andrew.
>
>
>
> *From:* cwb-bounces at sslmit.unibo.it <cwb-bounces at sslmit.unibo.it> *On
> Behalf Of *José Manuel Martínez Martínez
> *Sent:* 08 August 2018 09:05
> *To:* Open source development of the Corpus WorkBench <cwb at sslmit.unibo.it
> >
> *Subject:* Re: [CWB] Error #1300 generating word frequency lists
>
>
>
> Hi, Andrew,
>
>
>
> I think I found the root of the problem. My VRT files contain characters
> that are valid UTF-8; however, MySQL's `utf8` character set covers only a
> subset of full UTF-8. In order to have the full character set one needs
> to use utf8mb4.
>
>
>
> After some testing I found the files giving problems, and I think that all
> of them contained some kind of character out of the subset used by MySQL.
>
>
>
> We need to be sure that the database will use utf8mb4 instead of utf8 as
> character encoding and collation. See <
> https://mathiasbynens.be/notes/mysql-utf8mb4> or first answer here <
> https://stackoverflow.com/questions/22572558/how-to-set-character-set-database-and-collation-database-to-utf8-in-my-ini>
> and the third answer here could be relevant from the Python side <
> https://stackoverflow.com/questions/26532722/how-to-encode-utf8mb4-in-python
> >.
>
>
>
> If we wanted the tables to use this encoding and collation, we would need to
> change CQPweb's code. However, a change from utf8 to utf8mb4 is not
> trivial because the length of `char`, `varchar` and `handles` are affected
> (as we use 4 bytes for every character instead of 3 the size in characters
> of those types of variables is reduced). Having said that, I did not need
> to mess with the tables, it was enough to change some global configuration
> and the charset and collation of the database.
>
>
>
> However, my issue was solved just by doing the following:
>
>
>
> In mysql configuration file `/etc/mysql/my.cnf` I wrote:
>
>
>
> ```ini
>
> [client]
>
> default-character-set = utf8mb4
>
>
>
> [mysql]
>
> default-character-set = utf8mb4
>
>
>
> [mysqld]
>
> character-set-client-handshake = FALSE
>
> character-set-server = utf8mb4
>
> collation-server = utf8mb4_unicode_ci
>
> ```
>
>
>
> And then I also modified the character set and the collation for cqpweb_db:
>
>
>
> ```sql
>
> ALTER DATABASE cqpweb_db CHARACTER SET = utf8mb4 COLLATE =
> utf8mb4_unicode_ci;
>
> ```
>
>
>
> Check in mysql console with:
>
>
>
> ```sql
>
> SHOW VARIABLES WHERE Variable_name LIKE 'character\_set\_%' OR
> Variable_name LIKE 'collation%';
>
> ```
>
>
>
> If one is creating the database from scratch, one could use:
>
>
>
> ```sql
>
> CREATE DATABASE cqpweb_db2 DEFAULT CHARSET utf8mb4 COLLATE
> utf8mb4_general_ci;
>
> ```
>
>
>
> After modifying the MySQL configuration file and changing the character
> set and collation for the database (I did not change anything for the
> tables), CQPweb was able to generate the frequency lists without problems.
>
>
>
> I couldn't say if this is a critical issue. I never had this problem
> before, because I used to normalize characters. Now, I'm working with very
> heterogeneous data. I can foresee problems if someone is working with emojis
> and the like (tweets, etc.).
>
>
>
> Cheers,
>
>
>
> jmm
>
>
>
>
> --
>
> José Manuel Martínez Martínez
>
> https://chozelinek.github.io
>
>
>
> On Mon, Aug 6, 2018 at 3:01 PM, Hardie, Andrew <a.hardie at lancaster.ac.uk>
> wrote:
>
> >> is it possible to add a new corpus from the command line?
>
>
>
> Not yet.
>
>
>
> >> I've seen a create-corpus.php script but it says //TODO
>
>
>
> Precisely!
>
>
>
> *From:* cwb-bounces at sslmit.unibo.it <cwb-bounces at sslmit.unibo.it> *On
> Behalf Of *José Manuel Martínez Martínez
> *Sent:* 06 August 2018 13:30
>
>
> *To:* Open source development of the Corpus WorkBench <cwb at sslmit.unibo.it
> >
> *Subject:* Re: [CWB] Error #1300 generating word frequency lists
>
>
>
> Hi again,
>
>
>
> last question, is it possible to add a new corpus from the command line?
> Not only the generation of the frequency lists? I've seen a
> create-corpus.php script but it says //TODO ;-)
>
>
>
> And just in case it helps, this is what I see regarding my MySQL config
>
>
>
> mysql> show VARIABLES like '%collation%';
>
> +----------------------+-------------------+
>
> | Variable_name        | Value             |
>
> +----------------------+-------------------+
>
> | collation_connection | utf8_general_ci   |
>
> | collation_database   | utf8_general_ci   |
>
> | collation_server     | latin1_swedish_ci |
>
> +----------------------+-------------------+
>
> 3 rows in set (0.00 sec)
>
>
>
> mysql> show variables like '%character%';
>
> +--------------------------+----------------------------+
>
> | Variable_name            | Value                      |
>
> +--------------------------+----------------------------+
>
> | character_set_client     | utf8                       |
>
> | character_set_connection | utf8                       |
>
> | character_set_database   | utf8                       |
>
> | character_set_filesystem | binary                     |
>
> | character_set_results    | utf8                       |
>
> | character_set_server     | latin1                     |
>
> | character_set_system     | utf8                       |
>
> | character_sets_dir       | /usr/share/mysql/charsets/ |
>
> +--------------------------+----------------------------+
>
> 8 rows in set (0.00 sec)
>
> SHOW FULL COLUMNS FROM __tempfreq_spanish;
>
>
> +----------+------------------+-----------------+------+-----+---------+-------+---------------------------------+---------+
>
> | Field    | Type             | Collation       | Null | Key | Default |
> Extra | Privileges                      | Comment |
>
>
> +----------+------------------+-----------------+------+-----+---------+-------+---------------------------------+---------+
>
> | freq     | int(11) unsigned | NULL            | YES  |     | NULL    |
>     | select,insert,update,references |         |
>
> | word     | varchar(255)     | utf8_general_ci | NO   | MUL | NULL    |
>     | select,insert,update,references |         |
>
> | dep      | varchar(255)     | utf8_general_ci | NO   | MUL | NULL    |
>     | select,insert,update,references |         |
>
> | ent_type | varchar(255)     | utf8_general_ci | NO   | MUL | NULL    |
>     | select,insert,update,references |         |
>
> | is_alpha | varchar(255)     | utf8_general_ci | NO   | MUL | NULL    |
>     | select,insert,update,references |         |
>
> | is_digit | varchar(255)     | utf8_general_ci | NO   | MUL | NULL    |
>     | select,insert,update,references |         |
>
> | is_oov   | varchar(255)     | utf8_general_ci | NO   | MUL | NULL    |
>     | select,insert,update,references |         |
>
> | lemma    | varchar(255)     | utf8_general_ci | NO   | MUL | NULL    |
>     | select,insert,update,references |         |
>
> | lower    | varchar(255)     | utf8_general_ci | NO   | MUL | NULL    |
>     | select,insert,update,references |         |
>
> | pos      | varchar(255)     | utf8_general_ci | NO   | MUL | NULL    |
>     | select,insert,update,references |         |
>
> | tag      | varchar(255)     | utf8_general_ci | NO   | MUL | NULL    |
>     | select,insert,update,references |         |
>
>
> +----------+------------------+-----------------+------+-----+---------+-------+---------------------------------+---------+
>
> 11 rows in set (0.00 sec)
>
>
>
>
> --
>
> José Manuel Martínez Martínez
>
> https://chozelinek.github.io
>
>
>
> On Mon, Aug 6, 2018 at 1:31 PM, José Manuel Martínez Martínez <
> chozelinek at gmail.com> wrote:
>
> Hi Andrew,
>
>
>
> thanks for the pointers. I didn't mention it, but I'm installing the new
> corpora from already indexed corpora. Just in case this might be relevant.
>
>
>
> I'll check with iconv and also with the generation of the frequency lists.
>
>
>
> Cheers,
>
>
> --
>
> José Manuel Martínez Martínez
>
> https://chozelinek.github.io
>
>
>
> On Mon, Aug 6, 2018 at 12:03 PM, Hardie, Andrew <a.hardie at lancaster.ac.uk>
> wrote:
>
> A record is kept of the messages retrieved during indexing. Run this MySQL
> query to see it:
>
>
>
> SELECT indexing_notes FROM corpus_info WHERE corpus="lowercase corpus
> handle here";
>
>
>
> And you will see all the messages that cwb-encode & friends emitted during
> indexing.
>
>
>
> >> Would be there a way to run from the command line the command to
> generate the frequency lists?
>
>
>
> Yes, see Admin manual section 5.10 (p 48 in the version on the website
> <http://cwb.sourceforge.net/files/CQPwebAdminManual.pdf>)
>
>
>
> That’s just the freqlist. To encode offline, use the cwb binaries.
>
>
>
> But actually, it might be easier to run iconv(1) on your files with UTF-8
> as input encoding, and see whether/where it chokes.
>
>
>
> best
>
>
>
> Andrew.
>
>
>
>
>
> *From:* cwb-bounces at sslmit.unibo.it <cwb-bounces at sslmit.unibo.it> *On
> Behalf Of *José Manuel Martínez Martínez
> *Sent:* 06 August 2018 10:44
> *To:* Open source development of the Corpus WorkBench <cwb at sslmit.unibo.it
> >
> *Subject:* Re: [CWB] Error #1300 generating word frequency lists
>
>
>
> Hi Andrew,
>
>
>
> thank you very much for your quick reply.
>
>
>
> CQPweb v3.2.31
>
> CWB v3.4.14
>
>
>
> The underlying data should be UTF-8.
>
>
>
> I cannot remember right now if I had an encoding error at the encoding stage.
>
>
>
> I'll re-encode the corpus and let you know if I get any error in that
> regard.
>
>
>
> Would be there a way to run from the command line the command to generate
> the frequency lists? I think I can leave a script encoding incrementally
> all texts I have in my corpus, to find out at least, which file is
> producing problems.
>
>
>
> Cheers,
>
>
>
>
> --
>
> José Manuel Martínez Martínez
>
> https://chozelinek.github.io
>
>
>
> On Mon, Aug 6, 2018 at 10:10 AM, Hardie, Andrew <a.hardie at lancaster.ac.uk>
> wrote:
>
> The key bit of the error message is this:
>
>
>
> Error # 1300: Invalid utf8 character string: ''
>
>
>
> (unfortunate that the actual bad string can’t be identified from this.)
>
>
>
> This suggests that there is a bad string in the CWB index, and it is
> caught by the MySql db on freq list setup. Recent versions of CWB however
> should not permit the indexing of badly-encoded strings (recent meaning,
> last several years). You should have had an error at the encoding stage if
> there was an encoding error in your data.
>
>
>
> What’s your CWB version? (also your CQPweb version) Also, is the
> underlying data UTF-8 or Latin-1?
>
>
>
> best
>
>
>
> Andrew.
>
>
>
>
>
>
>
> *From:* cwb-bounces at sslmit.unibo.it <cwb-bounces at sslmit.unibo.it> *On
> Behalf Of *José Manuel Martínez Martínez
> *Sent:* 06 August 2018 08:18
> *To:* Open source development of the Corpus WorkBench <cwb at sslmit.unibo.it
> >
> *Subject:* [CWB] Error #1300 generating word frequency lists
>
>
>
> Good morning!
>
>
>
> Trying to run collocations on a corpus in Spanish, I've got an error.
>
>
>
> Somehow, the word frequency list wasn't generated.
>
>
>
> I tried to generate it again but the process fails and I get the traceback
> that I copy/paste below.
>
>
>
> Is this a CQPweb issue or should I check some settings of the MySQL
> database?
>
>
>
> Cheers,
>
>
>
> jmm
>
>
>
> --- TRACEBACK ---
>
>
>
> CQPweb encountered an error and could not continue.
>
>
>
>
>
> A MySQL query did not run successfully!
>
>
>
>
>
>
>
>
>
>
>
> Original query: LOAD DATA LOCAL INFILE
> '/data/cqpweb/tmp/______tempfreq_spanish.tbl' INTO TABLE
> `__tempfreq_spanish` FIELDS ESCAPED BY '' /* from User: datamaran |
> Function: corpus_make_freqtables() | 2018-Aug-03 12:41:27 */
>
>
>
>
>
>
>
>
>
>
>
> Error # 1300: Invalid utf8 character string: ''
>
>
>
>
>
>
>
> PHP debugging backtrace
>
> array(6) {
>
>   [1]=>
>
>   array(4) {
>
>     ["file"]=>
>
>     string(40) "/var/www/html/cqpweb/lib/library.inc.php"
>
>     ["line"]=>
>
>     int(286)
>
>     ["function"]=>
>
>     string(20) "exiterror_mysqlquery"
>
>     ["args"]=>
>
>     array(3) {
>
>       [0]=>
>
>       int(1300)
>
>       [1]=>
>
>       string(33) "Invalid utf8 character string: ''"
>
>       [2]=>
>
>       string(210) "LOAD DATA LOCAL INFILE
> '/data/cqpweb/tmp/______tempfreq_spanish.tbl' INTO TABLE
> `__tempfreq_spanish` FIELDS ESCAPED BY ''
>
>             /* from User: datamaran | Function: corpus_make_freqtables() |
> 2018-Aug-03 12:41:27 */"
>
>     }
>
>   }
>
>   [2]=>
>
>   array(4) {
>
>     ["file"]=>
>
>     string(40) "/var/www/html/cqpweb/lib/library.inc.php"
>
>     ["line"]=>
>
>     int(410)
>
>     ["function"]=>
>
>     string(14) "do_mysql_query"
>
>     ["args"]=>
>
>     array(1) {
>
>       [0]=>
>
>       &string(210) "LOAD DATA LOCAL INFILE
> '/data/cqpweb/tmp/______tempfreq_spanish.tbl' INTO TABLE
> `__tempfreq_spanish` FIELDS ESCAPED BY ''
>
>             /* from User: datamaran | Function: corpus_make_freqtables() |
> 2018-Aug-03 12:41:27 */"
>
>     }
>
>   }
>
>   [3]=>
>
>   array(4) {
>
>     ["file"]=>
>
>     string(42) "/var/www/html/cqpweb/lib/freqtable.inc.php"
>
>     ["line"]=>
>
>     int(124)
>
>     ["function"]=>
>
>     string(21) "do_mysql_infile_query"
>
>     ["args"]=>
>
>     array(3) {
>
>       [0]=>
>
>       string(18) "__tempfreq_spanish"
>
>       [1]=>
>
>       string(43) "/data/cqpweb/tmp/______tempfreq_spanish.tbl"
>
>       [2]=>
>
>       bool(true)
>
>     }
>
>   }
>
>   [4]=>
>
>   array(4) {
>
>     ["file"]=>
>
>     string(42) "/var/www/html/cqpweb/lib/admin-lib.inc.php"
>
>     ["line"]=>
>
>     int(838)
>
>     ["function"]=>
>
>     string(22) "corpus_make_freqtables"
>
>     ["args"]=>
>
>     array(1) {
>
>       [0]=>
>
>       string(7) "spanish"
>
>     }
>
>   }
>
>   [5]=>
>
>   array(4) {
>
>     ["file"]=>
>
>     string(47) "/var/www/html/cqpweb/lib/metadata-admin.inc.php"
>
>     ["line"]=>
>
>     int(179)
>
>     ["function"]=>
>
>     string(40) "create_text_metadata_auto_freqlist_calls"
>
>     ["args"]=>
>
>     array(1) {
>
>       [0]=>
>
>       string(7) "spanish"
>
>     }
>
>   }
>
>   [6]=>
>
>   array(4) {
>
>     ["file"]=>
>
>     string(43) "/var/www/html/cqpweb/exe/metadata-admin.php"
>
>     ["line"]=>
>
>     int(3)
>
>     ["args"]=>
>
>     array(1) {
>
>       [0]=>
>
>       string(47) "/var/www/html/cqpweb/lib/metadata-admin.inc.php"
>
>     }
>
>     ["function"]=>
>
>     string(7) "require"
>
>   }
>
> }
>
>
>
> --
>
> José Manuel Martínez Martínez
>
> https://chozelinek.github.io
>
>
> _______________________________________________
> CWB mailing list
> CWB at sslmit.unibo.it
> http://liste.sslmit.unibo.it/mailman/listinfo/cwb
>
>
>
>
> _______________________________________________
> CWB mailing list
> CWB at sslmit.unibo.it
> http://liste.sslmit.unibo.it/mailman/listinfo/cwb
>
>
>
>
>
>
> _______________________________________________
> CWB mailing list
> CWB at sslmit.unibo.it
> http://liste.sslmit.unibo.it/mailman/listinfo/cwb
>
>
> _______________________________________________
> CWB mailing list
> CWB at sslmit.unibo.it
> http://liste.sslmit.unibo.it/mailman/listinfo/cwb
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://liste.sslmit.unibo.it/pipermail/cwb/attachments/20180829/b2c1964e/attachment-0001.html>


More information about the CWB mailing list