summary refs log tree commit diff stats
path: root/tools/wn2rdf.pl
diff options
context:
space:
mode:
author Tomas Salfischberger <tomas@rockbox.org> 2005-05-02 15:05:07 +0000
committer Tomas Salfischberger <tomas@rockbox.org> 2005-05-02 15:05:07 +0000
commit 52abc68b11694d2360e119543b876cf3c5768fbe (patch)
tree 937b8bdc68faccf815efdccf1192c2dd92738932 /tools/wn2rdf.pl
parent a810a67db7c923b01c4135761ef21ab866db256d (diff)
download rockbox-52abc68b11694d2360e119543b876cf3c5768fbe.tar.gz
rockbox-52abc68b11694d2360e119543b876cf3c5768fbe.tar.bz2
rockbox-52abc68b11694d2360e119543b876cf3c5768fbe.zip
Dictionary conversion tools.
git-svn-id: svn://svn.rockbox.org/rockbox/trunk@6395 a1c6a512-1295-4272-9138-f99709370657
Diffstat (limited to 'tools/wn2rdf.pl')
-rw-r--r-- tools/wn2rdf.pl 122
1 files changed, 122 insertions, 0 deletions
diff --git a/tools/wn2rdf.pl b/tools/wn2rdf.pl
new file mode 100644
index 0000000000..2fff87d66b
--- /dev/null
+++ b/tools/wn2rdf.pl
@@ -0,0 +1,122 @@
+#! /usr/bin/perl -w
+
+# WordNet dictionary database converter
+#
+# Converts the WordNet Prolog data to Rockbox dictionary format.
+#
+# Written by Miika Pekkarinen <slasher@ihme.org>
+#
+# $Id$
+
+use strict;
+
+# Lookup tables, filled while the two input files are read:
+#   %words        - lowercased word => { synset id => first digit of that id }
+#   %descriptions - synset id => description text
+my (%words, %descriptions);
+
+# Translate a numeric category id into the one-letter tag used in the output.
+#
+#   $id - category id (1 = noun, 2 = verb, 3 = adjective, 4 = adverb)
+#
+# Returns 'N', 'V' or 'A'; any other id yields '?'.
+# NOTE(review): adjectives (3) and adverbs (4) both map to 'A', so the two
+# cannot be told apart in the output - confirm this is what the reader of
+# the generated file expects.
+sub getcatname {
+    my ($id) = @_;
+
+    # Entries are tried in order using a numeric comparison.
+    foreach my $entry ([1, 'N'], [2, 'V'], [3, 'A'], [4, 'A']) {
+        my ($number, $letter) = @$entry;
+        return $letter if $id == $number;
+    }
+
+    return '?';
+}
+
+# Open the two WordNet input files for reading and the intermediate output
+# file for writing.  The three-argument form of open keeps the mode separate
+# from the file name, and each failure message names the file involved.
+open IN_WORD, '<', 'wn_s.pl'
+    or die "Open fail(#1): wn_s.pl: $!";
+open IN_DESC, '<', 'wn_g.pl'
+    or die "Open fail(#2): wn_g.pl: $!";
+open OUTPUT, '>', 'dict.preparsed'
+    or die "Open fail(#3): dict.preparsed: $!";
+
+print "Reading word file...\n";
+
+# Read every record of the word file into %words, keyed by the lowercased
+# word and then by synset id.  A record looks like:
+#   s(100001740,1,'entity',n,1,11).
+while (my $line = <IN_WORD>) {
+    # Remove the line ending; a CR before the newline is dropped as well so
+    # that the anchored pattern below still matches CRLF input.
+    $line =~ s/\r?\n\z//;
+
+    # s(100001740,1,'entity',n,1,11). => 100001740,1,'entity',n,1,11
+    $line =~ s/^s\((.*)\)\.$/$1/;
+
+    # A record has six comma-separated fields.  Blank lines are skipped
+    # silently and shorter lines with a warning, rather than being stored
+    # as an empty word that can never be linked to a description.
+    my @fields = split /,/, $line, -1;
+    if (@fields < 6) {
+        warn "Skipping malformed line " . $. . " of wn_s.pl\n"
+            if $line =~ /\S/;
+        next;
+    }
+
+    # The synset id is the first field and the word starts at the third.
+    # Exactly three fields follow the word, so any extra fields in between
+    # belong to a word that itself contains commas.
+    my $seqid = $fields[0];
+    my $word  = join ',', @fields[2 .. $#fields - 3];
+
+    # 'entity' => entity
+    $word =~ s/^'(.*)'$/$1/;
+
+    # Collapse every doubled apostrophe, not only the first one.
+    $word =~ s/''/'/g;
+
+    # The first digit of the synset id is used as the category.
+    my $category = substr $seqid, 0, 1;
+
+    $words{lc $word}{$seqid} = $category;
+}
+
+close IN_WORD;
+
+print "Reading description file...\n";
+
+# Read every record of the description file into %descriptions, keyed by
+# synset id.  A record looks like:
+#   g(100002056,'(a separate and self-contained entity)').
+while (my $line = <IN_DESC>) {
+    # Remove the line ending; a CR before the newline is dropped as well so
+    # that the anchored patterns below still match CRLF input.
+    $line =~ s/\r?\n\z//;
+
+    # g(100002056,'(a separate and self-contained entity)').
+    # => 100002056,'(a separate and self-contained entity)'
+    $line =~ s/^g\((.*)\)\.$/$1/;
+
+    # Only the first comma separates the fields; the description itself
+    # may contain commas.
+    my ($seqid, $desc) = split /,/, $line, 2;
+
+    # Blank lines are skipped silently, other lines without both fields
+    # with a warning.
+    unless (defined $seqid and defined $desc) {
+        warn "Skipping malformed line " . $. . " of wn_g.pl\n"
+            if $line =~ /\S/;
+        next;
+    }
+
+    # '(a separate and self-contained entity)'
+    # => a separate and self-contained entity
+    $desc =~ s/^'\((.*)\)'$/$1/;
+
+    # Collapse every doubled apostrophe, not only the first one.
+    $desc =~ s/''/'/g;
+
+    $descriptions{$seqid} = $desc;
+}
+
+close IN_DESC;
+
+print "Sorting and writing output...\n";
+
+# Write one line per word, in sorted word order:
+#   word <TAB> N: 1. desc 2. desc <TAB> V: 1. desc ...
+foreach my $word (sort keys %words) {
+    my %categories;
+
+    # Collect the description of every synset the word belongs to, grouped
+    # by category.  The ids are visited in sorted order so that the numbering
+    # of the definitions is the same on every run.
+    foreach my $id (sort keys %{ $words{$word} }) {
+        my $catid       = $words{$word}{$id};
+        my $description = $descriptions{$id};
+
+        if (!defined($description) or $description eq '') {
+            # Report the synset id that has no description.
+            print STDERR "Error: Failed to link word: $word / $id\n";
+            exit 1;
+        }
+
+        push @{ $categories{$catid} }, $description;
+    }
+
+    # 1 = noun
+    # 2 = verb
+    # 3 = adjective
+    # 4 = adverb
+    my @parts;
+    for my $catid (1 .. 4) {
+        my $descs = $categories{$catid} or next;
+
+        # Number the definitions within the category: "1. foo 2. bar"
+        my $n = 0;
+        my @numbered = map { $n++; "$n. $_" } @$descs;
+
+        push @parts, getcatname($catid) . ': ' . join(' ', @numbered);
+    }
+
+    # Every word needs at least one definition in categories 1 to 4.
+    die "Internal error" unless @parts;
+
+    print OUTPUT $word, "\t", join("\t", @parts), "\n"
+        or die "Write fail: $!";
+}
+
+# Buffered write errors only show up when the handle is closed, so check the
+# result before reporting success.
+close OUTPUT or die "Close fail: $!";
+
+print "Done, output was successfully written!\n";
+
+