From 52abc68b11694d2360e119543b876cf3c5768fbe Mon Sep 17 00:00:00 2001
From: Tomas Salfischberger
Date: Mon, 2 May 2005 15:05:07 +0000
Subject: Dictionary conversion tools.

git-svn-id: svn://svn.rockbox.org/rockbox/trunk@6395 a1c6a512-1295-4272-9138-f99709370657
---
 tools/FILES        |   1 +
 tools/Makefile     |   5 ++-
 tools/rdf2binary.c | 125 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 tools/wn2rdf.pl    | 121 +++++++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 251 insertions(+), 1 deletion(-)
 create mode 100644 tools/rdf2binary.c
 create mode 100644 tools/wn2rdf.pl

(limited to 'tools')

diff --git a/tools/FILES b/tools/FILES
index 2e2d232160..14cdeddd41 100644
--- a/tools/FILES
+++ b/tools/FILES
@@ -10,6 +10,7 @@ rockbox-style.el
 sample.emacs
 buildzip.pl
 romsizetest.pl
+wn2rdf.pl
 make.inc
 makesrc.inc
 fwpatcher/*.[ch]
diff --git a/tools/Makefile b/tools/Makefile
index b98c269642..d8b1545015 100644
--- a/tools/Makefile
+++ b/tools/Makefile
@@ -9,7 +9,7 @@
 CFLAGS := -O -ansi -g
 LDFLAGS := -g
 
-TARGETS := scramble descramble sh2d bmp2rb convbdf generate_rocklatin mkboot
+TARGETS := scramble descramble sh2d bmp2rb rdf2binary convbdf generate_rocklatin mkboot
 
 all: $(TARGETS)
 	@echo "tools done"
@@ -26,6 +26,9 @@ sh2d: sh2d.c
 bmp2rb: bmp2rb.c
 	$(CC) -DAPPLICATION_NAME=\"$@\" -g $+ -o $@
 
+rdf2binary: rdf2binary.c
+	$(CC) -g $+ -o $@
+
 mkboot: mkboot.c
 	$(CC) -g $+ -o $@
 
diff --git a/tools/rdf2binary.c b/tools/rdf2binary.c
new file mode 100644
index 0000000000..3597efa727
--- /dev/null
+++ b/tools/rdf2binary.c
@@ -0,0 +1,125 @@
+/***************************************************************************
+ *             __________               __   ___.
+ *   Open      \______   \ ____   ____ |  | _\_ |__   _______  ___
+ *   Source     |       _//  _ \_/ ___\|  |/ /| __ \ /  _ \  \/  /
+ *   Jukebox    |    |   (  <_> )  \___|    < | \_\ (  <_> > <  <
+ *   Firmware   |____|_  /\____/ \___  >__|_ \|___  /\____/__/\_ \
+ *                     \/            \/     \/    \/            \/
+ * $Id$
+ *
+ * Copyright (C) 2005 Miika Pekkarinen
+ *
+ * All files in this archive are subject to the GNU General Public License.
+ * See the file COPYING in the source tree root for full license agreement.
+ *
+ * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
+ * KIND, either express or implied.
+ *
+ ****************************************************************************/
+
+/*
+This tool converts the rdf file to the binary data used in the dict plugin.
+*/
+
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+/* maximum word length, has to be the same in dict.c */
+#define WORDLEN 32
+
+struct word {
+    char word[WORDLEN];
+    long offset;
+};
+
+/* Write all len bytes of data to fd; print an error and exit with
+   status 3 if the write fails. Short writes are retried. */
+static void write_or_die(int fd, const void *data, size_t len)
+{
+    const char *p = data;
+
+    while (len > 0) {
+        ssize_t n = write(fd, p, len);
+
+        if (n <= 0) {
+            perror("Error: write failed");
+            exit(3);
+        }
+
+        p += n;
+        len -= (size_t)n;
+    }
+}
+
+/* Convert dict.preparsed (one "word<TAB>desc[<TAB>desc...]" entry per line)
+   into dict.index (fixed-size struct word records) and dict.desc (the
+   description texts the index offsets point into). Returns 0 on success,
+   1 if a file can't be opened, 2 on a parse error, 3 on a write error. */
+int main(void)
+{
+    FILE *in;
+    int idx_out, desc_out;
+    struct word w;
+    char buf[10000];
+    long cur_offset = 0;
+
+    in = fopen("dict.preparsed", "r");
+    /* O_CREAT requires the mode argument; O_TRUNC drops stale data when an
+       older, longer output file already exists */
+    idx_out = open("dict.index", O_WRONLY | O_CREAT | O_TRUNC, 0644);
+    desc_out = open("dict.desc", O_WRONLY | O_CREAT | O_TRUNC, 0644);
+
+    if (in == NULL || idx_out < 0 || desc_out < 0) {
+        fprintf(stderr, "Error: Some files couldn't be opened\n");
+        return 1;
+    }
+
+    while (fgets(buf, sizeof buf, in) != NULL) {
+        /* It is safe to use strtok here */
+        const char *word = strtok(buf, "\t");
+        const char *desc = strtok(NULL, "\t");
+
+        if (word == NULL || desc == NULL) {
+            fprintf(stderr, "Parse error!\n");
+            /* Passing NULL for %s is undefined behaviour */
+            fprintf(stderr, "word: %s\ndesc: %s\n",
+                    word ? word : "(null)", desc ? desc : "(null)");
+
+            return 2;
+        }
+
+        /* Zero the whole record first: strncpy() does not terminate a
+           truncated word, and the unused bytes would otherwise be written
+           out as uninitialized stack data */
+        memset(&w, 0, sizeof w);
+        strncpy(w.word, word, WORDLEN - 1);
+        w.offset = cur_offset;
+        write_or_die(idx_out, &w, sizeof w);
+
+        /* The remaining tab-separated fields are written one per line */
+        while (1) {
+            size_t len = strlen(desc);
+
+            cur_offset += (long)len;
+            write_or_die(desc_out, desc, len);
+
+            desc = strtok(NULL, "\t");
+            if (desc == NULL)
+                break;
+
+            cur_offset++;
+            write_or_die(desc_out, "\n", 1);
+        }
+    }
+
+    fclose(in);
+    if (close(idx_out) < 0 || close(desc_out) < 0) {
+        perror("Error: close failed");
+        return 3;
+    }
+
+    return 0;
+}
diff --git a/tools/wn2rdf.pl b/tools/wn2rdf.pl
new file mode 100644
index 0000000000..2fff87d66b
--- /dev/null
+++ b/tools/wn2rdf.pl
@@ -0,0 +1,121 @@
+#! /usr/bin/perl -w
+
+# Wordnet dictionary database converter
+#
+# Converts the Wordnet prolog data to rockbox dictionary format.
+#
+# Written by Miika Pekkarinen
+#
+# $Id$
+
+use strict;
+
+# Lookup tables
+my %words;          # lowercased word => { synset id => category digit }
+my %descriptions;   # synset id => description text
+
+# Map a category digit (the first digit of a synset id) to the letter
+# that is written in front of that category's definitions.
+sub getcatname {
+    my ($id) = @_;
+
+    return 'N' if $id == 1;
+    return 'V' if $id == 2;
+    return 'A' if $id == 3;
+    return 'A' if $id == 4;
+    return '?';
+}
+
+open my $in_word, '<', 'wn_s.pl' or die "Open fail(#1): $!";
+open my $in_desc, '<', 'wn_g.pl' or die "Open fail(#2): $!";
+open my $output, '>', 'dict.preparsed' or die "Open fail(#3): $!";
+
+print "Reading word file...\n";
+
+# Read everything into memory
+while (my $line = <$in_word>) {
+    # Strips the line ending (also a DOS one) and trailing blanks
+    $line =~ s/\s+\z//;
+
+    # s(100001740,1,'entity',n,1,11). => 100001740,1,'entity',n,1,11
+    # Anything that is not such a clause (e.g. an empty line) is skipped
+    next unless $line =~ s/^s\((.*)\)\.$/$1/;
+
+    my ($seqid, undef, $word) = split /,/, $line, 6;
+    next unless defined $word;
+
+    # 'entity' => entity
+    $word =~ s/^'(.*)'$/$1/;
+    # Prolog writes a quote inside an atom as ''; unescape every one
+    $word =~ s/''/'/g;
+
+    my $category = substr $seqid, 0, 1;
+
+    $words{lc $word}{$seqid} = $category;
+}
+
+close $in_word;
+
+print "Reading description file...\n";
+while (my $line = <$in_desc>) {
+    $line =~ s/\s+\z//;
+
+    # g(100002056,'(a separate and self-contained entity)').
+    # => 100002056,'(a separate and self-contained entity)'
+    next unless $line =~ s/^g\((.*)\)\.$/$1/;
+
+    my ($seqid, $desc) = split /,/, $line, 2;
+    next unless defined $desc;
+
+    $desc =~ s/^'\((.*)\)'$/$1/;
+    $desc =~ s/''/'/g;
+
+    $descriptions{$seqid} = $desc;
+}
+
+close $in_desc;
+
+print "Sorting and writing output...\n";
+
+# Now sort and find correct descriptions
+foreach my $word (sort keys %words) {
+    my %categories;
+
+    # Find all definitions of the word. The ids are sorted so that the
+    # definitions are numbered the same way on every run.
+    foreach my $id (sort keys %{$words{$word}}) {
+        my $catid = $words{$word}{$id};
+        my $description = $descriptions{$id};
+
+        if (!defined($description) or $description eq '') {
+            print STDERR "Error: Failed to link word: $word / $id\n";
+            exit 1;
+        }
+
+        push @{$categories{$catid}}, $description;
+    }
+
+    my @parts;
+
+    # 1 = noun
+    # 2 = verb
+    # 3 = adjective
+    # 4 = adverb
+    for my $catid (1 .. 4) {
+        my $descs = $categories{$catid} or next;
+
+        # "1. first definition 2. second definition ..."
+        my $catdesc = join ' ',
+            map { ($_ + 1) . ". $descs->[$_]" } 0 .. $#{$descs};
+
+        push @parts, getcatname($catid) . ": $catdesc";
+    }
+
+    die "Internal error" unless @parts;
+
+    print $output "$word\t", join("\t", @parts), "\n";
+}
+
+close $output or die "Close fail: $!";
+
+print "Done, output was successfully written!\n";
-- 
cgit