added WordNet synonyms extensions

Nikolay Degterinsky 2021-06-03 19:28:12 +00:00
parent 876f51ab95
commit ed12fb5604
27 changed files with 2544 additions and 16 deletions

View File

@ -331,3 +331,4 @@ endif()
add_subdirectory(fast_float)
add_subdirectory(libstemmer-c-cmake)
add_subdirectory(wordnet-blast-cmake)

View File

@ -13,11 +13,12 @@ if (NOT USE_INTERNAL_BOOST_LIBRARY)
regex
context
coroutine
graph
)
if(Boost_INCLUDE_DIR AND Boost_FILESYSTEM_LIBRARY AND Boost_FILESYSTEM_LIBRARY AND
Boost_PROGRAM_OPTIONS_LIBRARY AND Boost_REGEX_LIBRARY AND Boost_SYSTEM_LIBRARY AND Boost_CONTEXT_LIBRARY AND
Boost_COROUTINE_LIBRARY)
Boost_COROUTINE_LIBRARY AND Boost_GRAPH_LIBRARY)
set(EXTERNAL_BOOST_FOUND 1)
@ -32,6 +33,7 @@ if (NOT USE_INTERNAL_BOOST_LIBRARY)
add_library (_boost_system INTERFACE)
add_library (_boost_context INTERFACE)
add_library (_boost_coroutine INTERFACE)
add_library (_boost_graph INTERFACE)
target_link_libraries (_boost_filesystem INTERFACE ${Boost_FILESYSTEM_LIBRARY})
target_link_libraries (_boost_iostreams INTERFACE ${Boost_IOSTREAMS_LIBRARY})
@ -40,6 +42,7 @@ if (NOT USE_INTERNAL_BOOST_LIBRARY)
target_link_libraries (_boost_system INTERFACE ${Boost_SYSTEM_LIBRARY})
target_link_libraries (_boost_context INTERFACE ${Boost_CONTEXT_LIBRARY})
target_link_libraries (_boost_coroutine INTERFACE ${Boost_COROUTINE_LIBRARY})
target_link_libraries (_boost_graph INTERFACE ${Boost_GRAPH_LIBRARY})
add_library (boost::filesystem ALIAS _boost_filesystem)
add_library (boost::iostreams ALIAS _boost_iostreams)
@ -48,6 +51,7 @@ if (NOT USE_INTERNAL_BOOST_LIBRARY)
add_library (boost::system ALIAS _boost_system)
add_library (boost::context ALIAS _boost_context)
add_library (boost::coroutine ALIAS _boost_coroutine)
add_library (boost::graph ALIAS _boost_graph)
else()
set(EXTERNAL_BOOST_FOUND 0)
message (${RECONFIGURE_MESSAGE_LEVEL} "Can't find system boost")
@ -221,4 +225,16 @@ if (NOT EXTERNAL_BOOST_FOUND)
add_library (boost::coroutine ALIAS _boost_coroutine)
target_include_directories (_boost_coroutine PRIVATE ${LIBRARY_DIR})
target_link_libraries(_boost_coroutine PRIVATE _boost_context)
# graph
set (SRCS_GRAPH
"${LIBRARY_DIR}/libs/graph/src/graphml.cpp"
"${LIBRARY_DIR}/libs/graph/src/read_graphviz_new.cpp"
)
add_library (_boost_graph ${SRCS_GRAPH})
add_library (boost::graph ALIAS _boost_graph)
target_include_directories (_boost_graph PRIVATE ${LIBRARY_DIR})
endif ()

View File

@ -0,0 +1,13 @@
set(LIBRARY_DIR "${ClickHouse_SOURCE_DIR}/contrib/wordnet-blast")
set(SRCS
"${LIBRARY_DIR}/wnb/core/info_helper.cc"
"${LIBRARY_DIR}/wnb/core/load_wordnet.cc"
"${LIBRARY_DIR}/wnb/core/wordnet.cc"
)
add_library(wnb ${SRCS})
target_link_libraries(wnb PRIVATE boost::headers_only boost::graph)
target_include_directories(wnb PUBLIC "${LIBRARY_DIR}")

View File

@ -0,0 +1 @@
Ugo Jardonnet ugo.jardonnet/gmail

View File

@ -0,0 +1,65 @@
CMAKE_MINIMUM_REQUIRED(VERSION 2.6)
PROJECT(wnb)
# Boost dependency
#--------------------------------------------------
# IF (${CMAKE_SYSTEM_NAME} MATCHES "Darwin")
# SET (BOOST_ROOT /Developer/boost_build/) # Suggested path
# ELSE()
# SET (BOOST_ROOT "/usr/include")
# ENDIF()
##############
SET (BOOST_ROOT "${ClickHouse_SOURCE_DIR}/contrib/boost")
##############
MESSAGE(STATUS "** Search Boost root: ${BOOST_ROOT}")
FIND_PACKAGE(Boost 1.70.0 COMPONENTS graph REQUIRED)
MESSAGE(STATUS "** Boost Include: ${Boost_INCLUDE_DIR}")
MESSAGE(STATUS "** Boost Libraries: ${Boost_LIBRARY_DIRS}")
MESSAGE(STATUS "** Boost Libraries: ${Boost_LIBRARIES}")
INCLUDE_DIRECTORIES(${Boost_INCLUDE_DIR})
LINK_DIRECTORIES(${Boost_LIBRARY_DIRS})
# Project
#--------------------------------------------------
LINK_DIRECTORIES(${wnb_SOURCE_DIR}/lib)
INCLUDE_DIRECTORIES(${PROJECT_BINARY_DIR})
SET(PROJECT_VERSION "0.6")
SET(ARCHIVE_NAME ${CMAKE_PROJECT_NAME}-${PROJECT_VERSION})
ADD_CUSTOM_TARGET(dist
COMMAND git archive --prefix=${ARCHIVE_NAME}/ HEAD
| bzip2 > ${CMAKE_BINARY_DIR}/${ARCHIVE_NAME}.tar.bz2
WORKING_DIRECTORY ${CMAKE_SOURCE_DIR})
ADD_CUSTOM_TARGET(check
COMMAND ./check/check.sh ./check/list.txt)
## Compiler flags
IF (CMAKE_COMPILER_IS_GNUCXX)
list(APPEND CMAKE_CXX_FLAGS " --std=c++11 -O3 -DNDEBUG -Wall -Wextra")
#list(APPEND CMAKE_CXX_FLAGS " -g -Wall -Wextra")
ENDIF()
SET(WNB_SRCS wnb/core/wordnet.cc
wnb/core/load_wordnet.cc wnb/core/info_helper.cc)
# Executable
#--------------------------------------------------
ADD_EXECUTABLE (wntest wnb/main.cc ${WNB_SRCS})
SET(EXECUTABLE_OUTPUT_PATH ${wnb_BINARY_DIR}/bin)
# Static library
#--------------------------------------------------
ADD_LIBRARY(wnb ${WNB_SRCS})
SET(LIBRARY_OUTPUT_PATH ${wnb_BINARY_DIR}/lib)
IF (Boost_FOUND)
TARGET_LINK_LIBRARIES(wntest ${Boost_LIBRARIES})
TARGET_LINK_LIBRARIES(wnb ${Boost_LIBRARIES})
ENDIF()

View File

@ -0,0 +1,43 @@
=====================================================================
WordNet Blast
=====================================================================
In-memory access to the WordNet ontology.
DEPENDENCIES:
boost 1.46
wordnet-sense-index
colordiff (for wntest)
INSTALL:
cmake CMakeLists.txt
make
TESTS: (Beta)
make check
USAGE:
#include "wordnet.hh"
#include "wnb/nltk_similarity.hh"
using namespace std;
using namespace wnb;
int main()
{
wordnet wn(PATH_TO_WORDNET);
vector<synset> synsets1 = wn.get_synsets("cat");
vector<synset> synsets2 = wn.get_synsets("dog");
nltk_similarity similarity(wn);
float d = similarity(synsets1[0], synsets2[0], 6);
}
BUGS:
- Word Morphing is sometimes incorrect.
REFERENCE:
George A. Miller (1995). WordNet: A Lexical Database for English.
Communications of the ACM Vol. 38, No. 11: 39-41.

View File

@ -0,0 +1,25 @@
This license is available as the file LICENSE in any downloaded version of
WordNet.
WordNet Release 3.0
This software and database is being provided to you, the LICENSEE, by Princeton
University under the following license. By obtaining, using and/or copying this
software and database, you agree that you have read, understood, and will comply
with these terms and conditions.: Permission to use, copy, modify and distribute
this software and database and its documentation for any purpose and without fee
or royalty is hereby granted, provided that you agree to comply with the
following copyright notice and statements, including the disclaimer, and that
the same appear on ALL copies of the software, database and documentation,
including modifications that you make for internal use or for distribution.
WordNet 3.0 Copyright 2006 by Princeton University. All rights reserved. THIS
SOFTWARE AND DATABASE IS PROVIDED "AS IS" AND PRINCETON UNIVERSITY MAKES NO
REPRESENTATIONS OR WARRANTIES, EXPRESS OR IMPLIED. BY WAY OF EXAMPLE, BUT NOT
LIMITATION, PRINCETON UNIVERSITY MAKES NO REPRESENTATIONS OR WARRANTIES OF
MERCHANT- ABILITY OR FITNESS FOR ANY PARTICULAR PURPOSE OR THAT THE USE OF THE
LICENSED SOFTWARE, DATABASE OR DOCUMENTATION WILL NOT INFRINGE ANY THIRD PARTY
PATENTS, COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS. The name of Princeton
University or Princeton may not be used in advertising or publicity pertaining
to distribution of the software and/or database. Title to copyright in this
software, database and any associated documentation shall at all times remain
with Princeton University and LICENSEE agrees to preserve same.

View File

@ -0,0 +1,11 @@
* 0.6
- Improve tests
- get_synsets by pos
- Load wordnet a bit faster
- Fix build on Mac OS (thanks to Roman Kutlak)
- Update doc
- Improve testing
* 0.5
- get_synsets
with morphing partially implemented (thanks to Yaron Feigin)
- sense similarity

View File

@ -0,0 +1,852 @@
a
able
about
account
acid
across
act
addition
adjustment
advertisement
after
again
against
agreement
air
all
almost
among
amount
amusement
and
angle
angry
animal
answer
ant
any
apparatus
apple
approval
arch
argument
arm
army
art
as
at
attack
attempt
attention
attraction
authority
automatic
awake
baby
back
bad
bag
balance
ball
band
base
basin
basket
bath
be
beautiful
because
bed
bee
before
behaviour
belief
bell
bent
berry
between
bird
birth
bit
bite
bitter
black
blade
blood
blow
blue
board
boat
body
boiling
bone
book
boot
bottle
box
boy
brain
brake
branch
brass
bread
breath
brick
bridge
bright
broken
brother
brown
brush
bucket
building
bulb
burn
burst
business
but
butter
button
by
cake
camera
canvas
card
care
carriage
cart
cat
cause
certain
chain
chalk
chance
change
cheap
cheese
chemical
chest
chief
chin
church
circle
clean
clear
clock
cloth
cloud
coal
coat
cold
collar
colour
comb
come
comfort
committee
common
company
comparison
competition
complete
complex
condition
connection
conscious
control
cook
copper
copy
cord
cork
cotton
cough
country
cover
cow
crack
credit
crime
cruel
crush
cry
cup
cup
current
curtain
curve
cushion
damage
danger
dark
daughter
day
dead
dear
death
debt
decision
deep
degree
delicate
dependent
design
desire
destruction
detail
development
different
digestion
direction
dirty
discovery
discussion
disease
disgust
distance
distribution
division
do
dog
door
doubt
down
drain
drawer
dress
drink
driving
drop
dry
dust
ear
early
earth
east
edge
education
effect
egg
elastic
electric
end
engine
enough
equal
error
even
event
ever
every
example
exchange
existence
expansion
experience
expert
eye
face
fact
fall
false
family
far
farm
fat
father
fear
feather
feeble
feeling
female
fertile
fiction
field
fight
finger
fire
first
fish
fixed
flag
flame
flat
flight
floor
flower
fly
fold
food
foolish
foot
for
force
fork
form
forward
fowl
frame
free
frequent
friend
from
front
fruit
full
future
garden
general
get
girl
give
glass
glove
go
goat
gold
good
government
grain
grass
great
green
grey
grip
group
growth
guide
gun
hair
hammer
hand
hanging
happy
harbour
hard
harmony
hat
hate
have
he
head
healthy
hear
hearing
heart
heat
help
high
history
hole
hollow
hook
hope
horn
horse
hospital
hour
house
how
humour
I
ice
idea
if
ill
important
impulse
in
increase
industry
ink
insect
instrument
insurance
interest
invention
iron
island
jelly
jewel
join
journey
judge
jump
keep
kettle
key
kick
kind
kiss
knee
knife
knot
knowledge
land
language
last
late
laugh
law
lead
leaf
learning
leather
left
leg
let
letter
level
library
lift
light
like
limit
line
linen
lip
liquid
list
little
living
lock
long
look
loose
loss
loud
love
low
machine
make
male
man
manager
map
mark
market
married
mass
match
material
may
meal
measure
meat
medical
meeting
memory
metal
middle
military
milk
mind
mine
minute
mist
mixed
money
monkey
month
moon
morning
mother
motion
mountain
mouth
move
much
muscle
music
nail
name
narrow
nation
natural
near
necessary
neck
need
needle
nerve
net
new
news
night
no
noise
normal
north
nose
not
note
now
number
nut
observation
of
off
offer
office
oil
old
on
only
open
operation
opinion
opposite
or
orange
order
organization
ornament
other
out
oven
over
owner
page
pain
paint
paper
parallel
parcel
part
past
paste
payment
peace
pen
pencil
person
physical
picture
pig
pin
pipe
place
plane
plant
plate
play
please
pleasure
plough
pocket
point
poison
polish
political
poor
porter
position
possible
pot
potato
powder
power
present
price
print
prison
private
probable
process
produce
profit
property
prose
protest
public
pull
pump
punishment
purpose
push
put
quality
question
quick
quiet
quite
rail
rain
range
rat
rate
ray
reaction
reading
ready
reason
receipt
record
red
regret
regular
relation
religion
representative
request
respect
responsible
rest
reward
rhythm
rice
right
ring
river
road
rod
roll
roof
room
root
rough
round
rub
rule
run
sad
safe
sail
salt
same
sand
say
scale
school
science
scissors
screw
sea
seat
second
secret
secretary
see
seed
seem
selection
self
send
sense
separate
serious
servant
sex
shade
shake
shame
sharp
sheep
shelf
ship
shirt
shock
shoe
short
shut
side
sign
silk
silver
simple
sister
size
skin
skirt
sky
sleep
slip
slope
slow
small
smash
smell
smile
smoke
smooth
snake
sneeze
snow
so
soap
society
sock
soft
solid
some
son
song
sort
sound
soup
south
space
spade
special
sponge
spoon
spring
square
stage
stamp
star
start
statement
station
steam
steel
stem
step
stick
sticky
stiff
still
stitch
stocking
stomach
stone
stop
store
story
straight
strange
street
stretch
strong
structure
substance
such
sudden
sugar
suggestion
summer
sun
support
surprise
sweet
swim
system
table
tail
take
talk
tall
taste
tax
teaching
tendency
test
than
that
the
then
theory
there
thick
thin
thing
this
thought
thread
throat
through
through
thumb
thunder
ticket
tight
till
time
tin
tired
to
toe
together
tomorrow
tongue
tooth
top
touch
town
trade
train
transport
tray
tree
trick
trouble
trousers
true
turn
twist
umbrella
under
unit
up
use
value
verse
very
vessel
view
violent
voice
waiting
walk
wall
war
warm
wash
waste
watch
water
wave
wax
way
weather
week
weight
well
west
wet
wheel
when
where
while
whip
whistle
white
who
why
wide
will
wind
window
wine
wing
winter
wire
wise
with
woman
wood
wool
word
work
worm
wound
writing
wrong
year
yellow
yes
yesterday
you
young

View File

@ -0,0 +1,16 @@
#!/bin/bash
WNHOME=/usr/share/wordnet/
check() {
local word_list="$1"
echo "./bin/wntest $WNHOME ${word_list}"
time ./bin/wntest $WNHOME ${word_list} > ${word_list}.blast
echo "for i in \`cat ${word_list}\`; do wn $i -over; done"
time for i in `cat ${word_list}`; do wn $i -over; done > ${word_list}.wn
echo "diff ${word_list}.wn ${word_list}.blast -b"
colordiff -y ${word_list}.wn ${word_list}.blast -b
}
check "$1"

View File

@ -0,0 +1,7 @@
cat
lions
city
building
salvation
medications
haven

View File

@ -0,0 +1,72 @@
#ifndef _BFS_HH
# define _BFS_HH
# include <boost/graph/breadth_first_search.hpp>
# include <boost/graph/filtered_graph.hpp>
namespace wnb
{
struct synset;
namespace bfs // breadth first search tools
{
/// bfs_visitor
/// Sum distances and throw answer if target synset found
template <typename DistanceMap>
class distance_recorder : public boost::default_bfs_visitor
{
public:
distance_recorder(DistanceMap dist, const synset& s, int max)
: d(dist), target(s), max_length(max)
{ }
template <typename Edge, typename Graph>
void tree_edge(Edge e, const Graph& g) const
{
typename boost::graph_traits<Graph>::vertex_descriptor
u = boost::source(e, g), v = boost::target(e, g);
d[v] = d[u] + 1;
if (g[v] == target)
throw d[v];
if (d[v] > max_length)
throw -1;
}
private:
DistanceMap d;
const synset& target;
int max_length;
};
/// Convenience function
template <typename DistanceMap>
distance_recorder<DistanceMap>
record_distance(DistanceMap d, const synset& s, int m)
{
return distance_recorder<DistanceMap>(d, s, m);
}
/// This predicate function object determines which edges of the original
/// graph will show up in the filtered graph.
//FIXME: Do we really need a map here? (check cost of property_map construction;
// it should be light)
template <typename PointerSymbolMap>
struct hypo_hyper_edge {
hypo_hyper_edge() { }
hypo_hyper_edge(PointerSymbolMap pointer_symbol)
: m_pointer_symbol(pointer_symbol) { }
template <typename Edge>
bool operator()(const Edge& e) const {
int p_s = get(m_pointer_symbol, e);
//see pointer symbol list in info_helper.hh
return p_s == 1 || p_s == 2 || p_s == 3 || p_s == 4;
}
PointerSymbolMap m_pointer_symbol;
};
} // end of wnb::bfs
} // end of namespace wnb
#endif /* _BFS_HH */
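The visitor and edge filter above are meant to be driven by boost::breadth_first_search over a filtered view of the wordnet graph: distances accumulate on tree edges, and the answer is thrown as an exception once the target synset is reached (or -1 once max_length is exceeded). Below is a minimal driver sketch; it is not part of this commit, the helper name hypo_hyper_distance and the "bfs.hh" include path are illustrative, and it assumes the wnb::wordnet, ptr and synset types from wnb/core/wordnet.hh.

// Sketch only (not part of this commit): drive distance_recorder with a BFS
// over the hypo/hypernym-filtered wordnet graph.
#include <vector>
#include <boost/graph/breadth_first_search.hpp>
#include <boost/graph/filtered_graph.hpp>
#include <wnb/core/wordnet.hh>
#include "bfs.hh" // illustrative include path for the header above
namespace wnb
{
  // Edge distance between two synsets, or -1 if unreachable or beyond max_length.
  inline int hypo_hyper_distance(wordnet& wn, const synset& from, const synset& to, int max_length)
  {
    typedef boost::property_map<wordnet::graph, int ptr::*>::type symbol_map_t;
    symbol_map_t symbols = get(&ptr::pointer_symbol, wn.wordnet_graph);
    // Keep only hypernym/hyponym edges (pointer symbols 1..4, see info_helper.hh).
    bfs::hypo_hyper_edge<symbol_map_t> filter(symbols);
    boost::filtered_graph<wordnet::graph, bfs::hypo_hyper_edge<symbol_map_t> > fg(wn.wordnet_graph, filter);
    std::vector<int> distances(boost::num_vertices(wn.wordnet_graph), 0);
    try
    {
      // The visitor reports its result by throwing an int instead of returning.
      boost::breadth_first_search(fg, boost::vertex(from.id, wn.wordnet_graph),
                                  boost::visitor(bfs::record_distance(&distances[0], to, max_length)));
    }
    catch (int d)
    {
      return d;
    }
    return -1; // target synset not reachable from the source
  }
} // end of namespace wnb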

View File

@ -0,0 +1,148 @@
#include "info_helper.hh"
#include <iostream>
#include <fstream>
#include <sstream>
#include <map>
#include <cassert>
namespace wnb
{
// Class info_helper
/// List of pointer symbols
const char *
info_helper::symbols[info_helper::NB_SYMBOLS] = {
"!" , // 0 Antonym
"@" , // 1 Hypernym
"@i", // 2 Instance Hypernym
"~" , // 3 Hyponym
"~i", // 4 Instance Hyponym
"#m", // 5 Member holonym
"#s", // 6 Substance holonym
"#p", // 7 Part holonym
"%m", // 8 Member meronym
"%s", // 9 Substance meronym
"%p", // 10 Part meronym
"=" , // 11 Attribute
"+" , // 12 Derivationally related form
";c", // 13 Domain of synset - TOPIC
"-c", // 14 Member of this domain - TOPIC
";r", // 15 Domain of synset - REGION
"-r", // 16 Member of this domain - REGION
";u", // 17 Domain of synset - USAGE
"-u", // 18 Member of this domain - USAGE
//The pointer symbols for verbs are:
"*", // 19 Entailment
">", // 20 Cause
"^", // 21 Also see
"$", // 22 Verb Group
//The pointer symbols for adjectives are:
"&", // 23 Similar to
"<", // 24 Participle of verb
"\\", // 25 Pertainym (pertains to noun)
"=", // 26 Attribute
};
const std::string info_helper::sufx[] = {
/* Noun suffixes */
"s", "ses", "xes", "zes", "ches", "shes", "men", "ies",
/* Verb suffixes */
"s", "ies", "es", "es", "ed", "ed", "ing", "ing",
/* Adjective suffixes */
"er", "est", "er", "est"
};
const std::string info_helper::addr[] = {
/* Noun endings */
"", "s", "x", "z", "ch", "sh", "man", "y",
/* Verb endings */
"", "y", "e", "", "e", "", "e", "",
/* Adjective endings */
"", "", "e", "e"
};
const int info_helper::offsets[info_helper::NUMPARTS] = { 0, 0, 8, 16, 0, 0 };
const int info_helper::cnts[info_helper::NUMPARTS] = { 0, 8, 8, 4, 0, 0 };
void
info_helper::update_pos_maps()
{
// http://wordnet.princeton.edu/wordnet/man/wndb.5WN.html#sect3
indice_offset[UNKNOWN] = 0;
indice_offset[N] = 0;
indice_offset[V] = indice_offset[N] + pos_maps[N].size();
indice_offset[A] = indice_offset[V] + pos_maps[V].size();
indice_offset[R] = indice_offset[A] + pos_maps[A].size();
indice_offset[S] = indice_offset[R] + pos_maps[R].size();
}
int info_helper::compute_indice(int offset, pos_t pos)
{
if (pos == S)
pos = A;
std::map<int,int>& map = pos_maps[pos];
assert(pos <= 5 && pos > 0);
return indice_offset[pos] + map[offset];
}
// Function definitions
// Return relation between synset indices and offsets
static
std::map<int,int>
preprocess_data(const std::string& fn)
{
std::map<int,int> map;
std::ifstream file(fn.c_str());
if (!file.is_open())
throw std::runtime_error("preprocess_data: File not found: " + fn);
std::string row;
//skip header
const unsigned int header_nb_lines = 29;
for(std::size_t i = 0; i < header_nb_lines; i++)
std::getline(file, row);
int ind = 0;
//parse data line
while (std::getline(file, row))
{
std::stringstream srow(row);
int offset;
srow >> offset;
map.insert(std::pair<int,int>(offset, ind));
ind++;
}
file.close();
return map;
}
info_helper
preprocess_wordnet(const std::string& dn)
{
info_helper info;
info.pos_maps[N] = preprocess_data((dn + "data.noun")); // noun_map
info.pos_maps[V] = preprocess_data((dn + "data.verb")); // verb_map
info.pos_maps[A] = preprocess_data((dn + "data.adj")); // adj_map
info.pos_maps[R] = preprocess_data((dn + "data.adv")); // adv_map
info.update_pos_maps();
return info;
}
} // end of namespace wnb

View File

@ -0,0 +1,85 @@
#pragma once
# include <string>
# include <stdexcept>
# include <map>
# include "pos_t.hh"
namespace wnb
{
/// Useful information for wordnet in-memory import
struct info_helper
{
/// Symbols' size
static const std::size_t NB_SYMBOLS = 27;
static const std::size_t NUMPARTS = POS_ARRAY_SIZE;
/// List of pointer symbols
static const char * symbols[NB_SYMBOLS];
static const std::string sufx[];
static const std::string addr[];
static const int offsets[NUMPARTS];
static const int cnts[NUMPARTS];
typedef std::map<int,int> i2of_t; ///< indice/offset correspondences
typedef std::map<pos_t, i2of_t> pos_i2of_t; ///< pos / map correspondences
/// Constructor
info_helper() { update_pos_maps(); }
/// Compute the number of synsets (i.e. the number of vertex in the graph)
unsigned nb_synsets()
{
typedef pos_i2of_t::iterator iter_t;
int sum = 0;
for (iter_t it = pos_maps.begin(); it != pos_maps.end(); it++)
sum += (*it).second.size();
return sum;
//return adj_map.size() + adv_map.size() + noun_map.size() + verb_map.size();
}
// Given a pos return the starting indice in the graph
int get_indice_offset(pos_t pos)
{
return indice_offset[pos];
}
/// Helper function computing global indice in graph from local offset
int compute_indice(int offset, pos_t pos);
/// Update a map allowing one to get the correct map given a pos
void update_pos_maps();
int get_symbol(const std::string& ps)
{
for (unsigned i = 0; i < NB_SYMBOLS; i++)
if (ps == symbols[i])
return i;
throw std::runtime_error("Symbol NOT FOUND.");
}
pos_t get_pos(const char& c)
{
return get_pos_from_char(c);
}
public:
// i2of_t adj_map;
// i2of_t adv_map;
// i2of_t noun_map;
// i2of_t verb_map;
pos_i2of_t pos_maps;
std::size_t indice_offset[POS_ARRAY_SIZE];
};
/// Create a new info_helper based on wordnet data located in dn (../dict/)
info_helper preprocess_wordnet(const std::string& dn);
} // end of namespace wnb

View File

@ -0,0 +1,381 @@
#include "load_wordnet.hh"
#include <fstream>
#include <iostream>
#include <sstream>
#include <stdexcept>
#include <algorithm>
#include <utility>
#include <boost/graph/adjacency_list.hpp>
#include <boost/progress.hpp>
#include <boost/algorithm/string.hpp>
#include <wnb/std_ext.hh>
#include "wordnet.hh"
#include "info_helper.hh"
#include "pos_t.hh"
namespace bg = boost::graph;
namespace wnb
{
namespace
{
// Load synset's words
void load_data_row_words(std::stringstream& srow, synset& synset)
{
srow >> std::hex >> synset.w_cnt >> std::dec;
for (std::size_t i = 0; i < synset.w_cnt; i++)
{
//word lex_id
std::string word;
srow >> word;
synset.words.push_back(word);
int lex_id;
srow >> std::hex >> lex_id >> std::dec;
synset.lex_ids.push_back(lex_id);
}
}
// Add rel to graph
void add_wordnet_rel(std::string& pointer_symbol_,// type of relation
int synset_offset, // dest offset
pos_t pos, // p.o.s. of dest
int src, // word src
int trgt, // word target
synset& synset, // source synset
wordnet& wn, // our wordnet
info_helper& info) // helper
{
//if (pos == S || synset.pos == S)
// return; //FIXME: check where are s synsets.
int u = synset.id;
int v = info.compute_indice(synset_offset, pos);
ptr p;
p.pointer_symbol = info.get_symbol(pointer_symbol_);
p.source = src;
p.target = trgt;
boost::add_edge(u,v, p, wn.wordnet_graph);
}
// load ptrs
void load_data_row_ptrs(std::stringstream& srow, synset& synset,
wordnet& wn, info_helper& info)
{
srow >> synset.p_cnt;
for (std::size_t i = 0; i < synset.p_cnt; i++)
{
//http://wordnet.princeton.edu/wordnet/man/wndb.5WN.html#sect3
//pointer_symbol synset_offset pos source/target
std::string pointer_symbol_;
int synset_offset;
pos_t pos;
int src;
int trgt;
srow >> pointer_symbol_;
srow >> synset_offset;
char c;
srow >> c;
pos = info.get_pos(c);
//print extracted edges
//std::cout << "(" << pointer_symbol << ", " << synset_offset;
//std::cout << ", " << pos << ")" << std::endl;
// Extract source/target words info
std::string src_trgt;
srow >> src_trgt;
std::stringstream ssrc(std::string(src_trgt,0,2));
std::stringstream strgt(std::string(src_trgt,2,2));
ssrc >> std::hex >> src >> std::dec;
strgt >> std::hex >> trgt >> std::dec;
add_wordnet_rel(pointer_symbol_, synset_offset, pos, src, trgt, synset, wn, info);
}
}
// Load a synset and add it to the wordnet class.
void load_data_row(const std::string& row, wordnet& wn, info_helper& info)
{
//http://wordnet.princeton.edu/wordnet/man/wndb.5WN.html#sect3
// synset_offset lex_filenum ss_type w_cnt word lex_id [word lex_id...] p_cnt [ptr...] [frames...] | gloss
synset synset;
std::stringstream srow(row);
int synset_offset;
srow >> synset_offset;
srow >> synset.lex_filenum;
char ss_type;
srow >> ss_type;
// extra information
synset.pos = info.get_pos(ss_type);
synset.id = info.compute_indice(synset_offset, synset.pos);
// words
load_data_row_words(srow, synset);
// ptrs
load_data_row_ptrs(srow, synset, wn, info);
//frames (skipped)
std::string tmp;
while (srow >> tmp)
if (tmp == "|")
break;
// gloss
std::getline(srow, synset.gloss);
// extra
synset.sense_number = 0;
// Add synset to graph
wn.wordnet_graph[synset.id] = synset;
}
// Parse data.noun files
void load_wordnet_data(const std::string& fn, wordnet& wn, info_helper& info)
{
std::ifstream fin(fn.c_str());
if (!fin.is_open())
throw std::runtime_error("File missing: " + fn);
static const int MAX_LENGTH = 20480;
char row[MAX_LENGTH];
//skip header
for(unsigned i = 0; i < 29; i++)
fin.getline(row, MAX_LENGTH);
//parse data line
while (fin.getline(row, MAX_LENGTH))
load_data_row(row, wn, info);
fin.close();
}
//FIXME: It seems possible to replace synset_offsets with indice here.
void load_index_row(const std::string& row, wordnet& wn, info_helper& info)
{
// lemma pos synset_cnt p_cnt [ptr_symbol...] sense_cnt tagsense_cnt synset_offset [synset_offset...]
index index;
std::stringstream srow(row);
char pos;
srow >> index.lemma;
srow >> pos;
index.pos = info.get_pos(pos); // extra data
srow >> index.synset_cnt;
srow >> index.p_cnt;
std::string tmp_p;
for (std::size_t i = 0; i < index.p_cnt; i++)
{
srow >> tmp_p;
index.ptr_symbols.push_back(tmp_p);
}
srow >> index.sense_cnt;
srow >> index.tagsense_cnt;
int tmp_o;
while (srow >> tmp_o)
{
index.synset_offsets.push_back(tmp_o);
index.synset_ids.push_back(info.compute_indice(tmp_o, index.pos)); // extra data
}
//add synset to index list
wn.index_list.push_back(index);
}
void load_wordnet_index(const std::string& fn, wordnet& wn, info_helper& info)
{
std::ifstream fin(fn.c_str());
if (!fin.is_open())
throw std::runtime_error("File Not Found: " + fn);
static const int MAX_LENGTH = 20480;
char row[MAX_LENGTH];
//skip header
const unsigned int header_nb_lines = 29;
for(std::size_t i = 0; i < header_nb_lines; i++)
fin.getline(row, MAX_LENGTH);
//parse data line
while (fin.getline(row, MAX_LENGTH))
load_index_row(row, wn, info);
fin.close();
}
void load_wordnet_exc(const std::string& dn, std::string cat,
wordnet& wn, info_helper&)
{
std::string fn = dn + cat + ".exc";
std::ifstream fin(fn.c_str());
if (!fin.is_open())
throw std::runtime_error("File Not Found: " + fn);
std::map<std::string,std::string>& exc = wn.exc[get_pos_from_name(cat)];
std::string row;
std::string key, value;
while (std::getline(fin, row))
{
std::stringstream srow(row);
srow >> key;
srow >> value;
exc[key] = value;
}
}
void load_wordnet_cat(const std::string dn, std::string cat,
wordnet& wn, info_helper& info)
{
load_wordnet_data((dn + "data." + cat), wn, info);
load_wordnet_index((dn + "index." + cat), wn, info);
load_wordnet_exc(dn, cat, wn, info);
}
// FIXME: this file is not in all packaged version of wordnet
void load_wordnet_index_sense(const std::string& dn, wordnet& wn, info_helper& info)
{
std::string fn = dn + "index.sense";
std::ifstream fin(fn.c_str());
if (!fin.is_open())
throw std::runtime_error("File Not Found: " + fn);
std::string row;
std::string sense_key;
int synset_offset;
while (std::getline(fin, row))
{
std::stringstream srow(row);
srow >> sense_key;
// Get the pos of the lemma
std::vector<std::string> sk = ext::split(sense_key,'%');
std::string word = sk.at(0);
std::stringstream tmp(ext::split(sk.at(1), ':').at(0));
int ss_type;
tmp >> ss_type;
pos_t pos = (pos_t) ss_type;
srow >> synset_offset;
// Update synset info
int u = info.compute_indice(synset_offset, pos);
int sense_number;
srow >> sense_number;
wn.wordnet_graph[u].sense_number += sense_number;
int tag_cnt;
srow >> tag_cnt;
if (tag_cnt != 0)
wn.wordnet_graph[u].tag_cnts.push_back( make_pair(word,tag_cnt) );
//if (synset_offset == 2121620)
// std::cout << u << " " << word << " " << synset_offset << " "
// << wn.wordnet_graph[u].tag_cnt << " "
// << wn.wordnet_graph[u].words[0] << std::endl;
}
}
// `wn -over` uses the info in cntlist even though it is deprecated.
// It is OK to leave this function unfixed and unused.
void load_wordnet_cntlist(const std::string& dn, wordnet& wn, info_helper& info)
{
std::string fn = dn + "cntlist";
std::ifstream fin(fn.c_str());
if (!fin.is_open())
throw std::runtime_error("File Not Found: " + fn);
std::string sense_key;
int sense_number;
int tag_cnt;
std::string row;
while (std::getline(fin, row))
{
std::stringstream srow(row);
srow >> sense_key;
srow >> sense_number;
srow >> tag_cnt;
// Get the pos of the lemma
std::string word = ext::split(sense_key,'%').at(0);
std::stringstream tmp(ext::split(ext::split(sense_key,'%').at(1), ':').at(0));
int ss_type;
tmp >> ss_type;
pos_t pos = (pos_t) ss_type;
// Update synset info
int synset_offset = 0; // FIXME: cntlist rows do not contain a synset offset, so this lookup is incorrect
int u = info.compute_indice(synset_offset, pos);
wn.wordnet_graph[u].sense_number += sense_number;
if (tag_cnt != 0)
wn.wordnet_graph[u].tag_cnts.push_back( make_pair(word,tag_cnt) );
}
}
} // end of anonymous namespace
void load_wordnet(const std::string& dn, wordnet& wn, info_helper& info)
{
// vertex added in this order a n r v
std::string fn = dn;
if (wn._verbose)
{
std::cout << std::endl;
std::cout << "### Loading Wordnet 3.0";
boost::progress_display show_progress(5);
boost::progress_timer t;
load_wordnet_cat(dn, "adj", wn, info);
++show_progress;
load_wordnet_cat(dn, "noun", wn, info);
++show_progress;
load_wordnet_cat(dn, "adv", wn, info);
++show_progress;
load_wordnet_cat(dn, "verb", wn, info);
++show_progress;
load_wordnet_index_sense(dn, wn, info);
++show_progress;
std::cout << std::endl;
}
else
{
load_wordnet_cat(dn, "adj", wn, info);
load_wordnet_cat(dn, "noun", wn, info);
load_wordnet_cat(dn, "adv", wn, info);
load_wordnet_cat(dn, "verb", wn, info);
load_wordnet_index_sense(dn, wn, info);
}
std::stable_sort(wn.index_list.begin(), wn.index_list.end());
}
} // end of namespace wnb

View File

@ -0,0 +1,12 @@
#pragma once
# include "info_helper.hh"
namespace wnb
{
/// forward declaration
struct wordnet;
/// Load the entire wordnet database located in \p dn (typically .../dict/)
void load_wordnet(const std::string& dn, wordnet& wn, info_helper& info);
}

View File

@ -0,0 +1,61 @@
#pragma once
namespace wnb
{
static const std::size_t POS_ARRAY_SIZE = 6;
static const char POS_ARRAY[POS_ARRAY_SIZE] = {'u', 'n', 'v', 'a', 'r', 's'};
enum pos_t
{
UNKNOWN = 0,
N = 1,
V = 2,
A = 3,
R = 4,
S = 5,
};
inline pos_t get_pos_from_name(const std::string& pos)
{
if (pos == "adj")
return A;
if (pos == "noun")
return N;
if (pos == "adv")
return R;
if (pos == "verb")
return V;
if (pos == "adj sat")
return S;
return UNKNOWN;
}
inline std::string get_name_from_pos(const pos_t& pos)
{
switch (pos)
{
case A: return "adj";
case N: return "noun";
case R: return "adv";
case V: return "verb";
case S: return "adj sat";
default: return "UNKNOWN";
}
}
inline pos_t get_pos_from_char(const char& c)
{
switch (c)
{
case 'a': return A;
case 'n': return N;
case 'r': return R;
case 'v': return V;
case 's': return S;
default: return UNKNOWN;
}
}
} // end of namespace wnb

View File

@ -0,0 +1,186 @@
#include <wnb/core/wordnet.hh>
#include <wnb/std_ext.hh>
#include <string>
#include <set>
#include <algorithm>
#include <stdexcept>
#include <boost/graph/breadth_first_search.hpp>
#include <boost/graph/filtered_graph.hpp>
namespace wnb
{
//FIXME: Make (smart) use of fs::path
wordnet::wordnet(const std::string& wordnet_dir, bool verbose)
: _verbose(verbose)
{
if (_verbose)
{
std::cout << wordnet_dir << std::endl;
}
info = preprocess_wordnet(wordnet_dir);
wordnet_graph = graph(info.nb_synsets());
load_wordnet(wordnet_dir, *this, info);
if (_verbose)
{
std::cout << "nb_synsets: " << info.nb_synsets() << std::endl;
}
//FIXME: this check is only valid for Wordnet 3.0
//assert(info.nb_synsets() == 142335);//117659);
assert(info.nb_synsets() > 0);
}
std::vector<synset>
wordnet::get_synsets(const std::string& word, pos_t pos)
{
std::vector<synset> synsets;
// morphing
std::string mword = morphword(word, pos);
if (mword == "")
return synsets;
// binary_search
typedef std::vector<index> vi;
std::pair<vi::iterator,vi::iterator> bounds = get_indexes(mword);
vi::iterator it;
for (it = bounds.first; it != bounds.second; it++)
{
if (pos == pos_t::UNKNOWN || it->pos == pos)
{
for (std::size_t i = 0; i < it->synset_ids.size(); i++)
{
int id = it->synset_ids[i];
synsets.push_back(wordnet_graph[id]);
}
}
}
return synsets;
}
const std::vector<std::string> *
wordnet::get_synset(const std::string& word, pos_t pos) const {
typedef std::vector<index> vi;
std::pair<vi::const_iterator,vi::const_iterator> bounds = get_indexes_const(word);
for (vi::const_iterator it = bounds.first; it != bounds.second; it++)
{
if (pos == pos_t::UNKNOWN || it->pos == pos)
{
int id = it->synset_ids[0];
return &wordnet_graph[id].words;
}
}
return nullptr;
}
std::pair<std::vector<index>::const_iterator, std::vector<index>::const_iterator>
wordnet::get_indexes_const(const std::string& word) const
{
index light_index;
light_index.lemma = word;
typedef std::vector<index> vi;
std::pair<vi::const_iterator,vi::const_iterator> bounds =
std::equal_range(index_list.begin(), index_list.end(), light_index);
return bounds;
}
std::pair<std::vector<index>::iterator, std::vector<index>::iterator>
wordnet::get_indexes(const std::string& word)
{
index light_index;
light_index.lemma = word;
typedef std::vector<index> vi;
std::pair<vi::iterator,vi::iterator> bounds =
std::equal_range(index_list.begin(), index_list.end(), light_index);
return bounds;
}
std::string
wordnet::wordbase(const std::string& word, int ender)
{
if (ext::ends_with(word, info.sufx[ender]))
{
int sufxlen = info.sufx[ender].size();
std::string strOut = word.substr(0, word.size() - sufxlen);
if (!info.addr[ender].empty())
strOut += info.addr[ender];
return strOut;
}
return word;
}
bool is_defined(const std::string& word, pos_t pos)
{
// hack FIXME: wordbase() can build invalid verb forms ending in -e ('builde' is just one example).
if (pos == V && word == "builde")
return false;
return true;
}
// Try to find baseform (lemma) of individual word in POS
std::string
wordnet::morphword(const std::string& word, pos_t pos)
{
// first look for word on exception list
exc_t::iterator it = exc[pos].find(word);
if (it != exc[pos].end())
return it->second; // found in exception list
std::string tmpbuf;
std::string end;
int cnt = 0;
if (pos == R)
return ""; // Only use exception list for adverbs
if (pos == N)
{
if (ext::ends_with(word, "ful"))
{
cnt = word.size() - 3;
tmpbuf = word.substr(0, cnt);
end = "ful";
}
else
{
// check for noun ending with 'ss' or short words
if (ext::ends_with(word, "ss") || word.size() <= 2)
return "";
}
}
// If not in exception list, try applying rules from tables
if (tmpbuf.size() == 0)
tmpbuf = word;
if (pos != pos_t::UNKNOWN)
{
int offset = info.offsets[pos];
int pos_cnt = info.cnts[pos];
std::string morphed;
for (int i = 0; i < pos_cnt; i++)
{
morphed = wordbase(tmpbuf, (i + offset));
if (morphed != tmpbuf && is_defined(morphed, pos))
return morphed + end;
}
return morphed;
}
return word;
}
} // end of namespace wnb

View File

@ -0,0 +1,113 @@
#pragma once
# include <iostream>
# include <string>
# include <cassert>
# include <vector>
//# include <boost/filesystem.hpp>
//Possible https://bugs.launchpad.net/ubuntu/+source/boost/+bug/270873
# include <boost/graph/graph_traits.hpp>
# include <boost/graph/adjacency_list.hpp>
# include "load_wordnet.hh"
# include "pos_t.hh"
namespace wnb
{
/// More info here: http://wordnet.princeton.edu/wordnet/man/wndb.5WN.html
struct info_helper;
/// Synset
struct synset
{
int lex_filenum;
std::size_t w_cnt;
std::vector<std::string> words;
std::vector<int> lex_ids;
std::size_t p_cnt;
std::string gloss;
// extra
pos_t pos; ///< pos (replace ss_type)
int id; ///< unique identifier (replace synset_offset)
int sense_number; ///< http://wordnet.princeton.edu/man/senseidx.5WN.html
std::vector<std::pair<std::string, int> > tag_cnts; ///< http://wordnet.princeton.edu/man/senseidx.5WN.html
bool operator==(const synset& s) const { return (id == s.id); }
bool operator<(const synset& s) const { return (id < s.id); }
};
/// Rel between synsets properties
struct ptr
{
//std::string pointer_symbol; ///< symbol of the relation
int pointer_symbol;
int source; ///< source word inside synset
int target; ///< target word inside synset
};
/// Index
struct index
{
std::string lemma;
std::size_t synset_cnt;
std::size_t p_cnt;
std::size_t sense_cnt;
float tagsense_cnt;
std::vector<std::string> ptr_symbols;
std::vector<int> synset_offsets;
// extra
std::vector<int> synset_ids;
pos_t pos;
bool operator<(const index& b) const
{
return (lemma.compare(b.lemma) < 0);
}
};
/// Wordnet interface class
struct wordnet
{
typedef boost::adjacency_list<boost::vecS, boost::vecS,
boost::directedS,
synset, ptr> graph; ///< boost graph type
/// Constructor
wordnet(const std::string& wordnet_dir, bool verbose=false);
/// Return synsets matching word
std::vector<synset> get_synsets(const std::string& word, pos_t pos = pos_t::UNKNOWN);
//FIXME: todo
std::vector<synset> get_synset(const std::string& word, char pos, int i);
// added
const std::vector<std::string> * get_synset(const std::string& word, pos_t pos = pos_t::UNKNOWN) const;
std::pair<std::vector<index>::iterator, std::vector<index>::iterator>
get_indexes(const std::string& word);
std::pair<std::vector<index>::const_iterator, std::vector<index>::const_iterator>
get_indexes_const(const std::string& word) const;
std::string wordbase(const std::string& word, int ender);
std::string morphword(const std::string& word, pos_t pos);
std::vector<index> index_list; ///< index list // FIXME: use a map
graph wordnet_graph; ///< synsets graph
info_helper info; ///< helper object
bool _verbose;
typedef std::map<std::string,std::string> exc_t;
std::map<pos_t, exc_t> exc;
};
} // end of namespace wnb

View File

@ -0,0 +1,180 @@
#include <iostream>
#include <fstream>
#include <sstream>
#include <boost/progress.hpp>
#include <boost/algorithm/string.hpp>
#include <wnb/core/wordnet.hh>
#include <wnb/core/load_wordnet.hh>
#include <wnb/core/info_helper.hh>
#include <wnb/nltk_similarity.hh>
#include <wnb/std_ext.hh>
using namespace wnb;
using namespace boost;
using namespace boost::algorithm;
bool usage(int argc, char ** argv)
{
std::string dir;
if (argc >= 2)
dir = std::string(argv[1]);
if (argc != 3 || dir[dir.length()-1] != '/')
{
std::cout << argv[0] << " .../wordnet_dir/ word_list_file" << std::endl;
return true;
}
return false;
}
struct ws
{
std::string w;
float s;
bool operator<(const ws& a) const {return s > a.s;}
};
/// Compute similarity of word with words in word list
std::vector<ws>
compute_similarities(wordnet& wn,
const std::string& word,
const std::vector<std::string>& word_list)
{
std::vector<ws> wslist;
std::vector<synset> synsets1 = wn.get_synsets(word);
for (unsigned i = 0; i < synsets1.size(); i++)
for (unsigned k = 0; k < synsets1[i].words.size(); k++)
std::cout << " - " << synsets1[i].words[k] << std::endl;
nltk_similarity path_similarity(wn);
{
progress_timer t;
progress_display show_progress(word_list.size());
for (unsigned k = 0; k < word_list.size(); k++)
{
const std::string& w = word_list[k];
float max = 0;
std::vector<synset> synsets2 = wn.get_synsets(w);
for (unsigned i = 0; i < synsets1.size(); i++)
{
for (unsigned j = 0; j < synsets2.size(); j++)
{
float s = path_similarity(synsets1[i], synsets2[j], 6);
if (s > max)
max = s;
}
}
ws e = {w, max};
wslist.push_back(e);
++show_progress;
}
}
return wslist;
}
void similarity_test(wordnet& wn,
const std::string& word,
std::vector<std::string>& word_list)
{
std::vector<ws> wslist = compute_similarities(wn, word, word_list);
std::stable_sort(wslist.begin(), wslist.end());
for (unsigned i = 0; i < std::min(wslist.size(), size_t(10)); i++)
std::cout << wslist[i].w << " " << wslist[i].s << std::endl;
}
void print_synsets(pos_t pos, wnb::index& idx, wordnet& wn)
{
std::string& mword = idx.lemma;
std::cout << "\nOverview of " << get_name_from_pos(pos) << " " << mword << "\n\n";
std::cout << "The " << get_name_from_pos(pos) << " " << mword << " has "
<< idx.synset_ids.size() << ((idx.synset_ids.size() == 1) ? " sense": " senses");
if (idx.tagsense_cnt != 0)
std::cout << " (first " << idx.tagsense_cnt << " from tagged texts)";
else
std::cout << " (no senses from tagged texts)";
std::cout << "\n";
std::cout << " \n";
for (std::size_t i = 0; i < idx.synset_ids.size(); i++)
{
int id = idx.synset_ids[i];
const synset& synset = wn.wordnet_graph[id];
std::cout << i+1 << ". ";
for (std::size_t k = 0; k < synset.tag_cnts.size(); k++)
{
if (synset.tag_cnts[k].first == mword)
std::cout << "(" << synset.tag_cnts[k].second << ") ";
}
std::vector<std::string> nwords;
for (auto& w : synset.words)
nwords.push_back((pos == A) ? w.substr(0, w.find_first_of("(")) : w);
std::cout << replace_all_copy(join(nwords, ", "), "_", " ");
std::cout << " -- (" << trim_copy(synset.gloss) << ")";
std::cout << std::endl;
}
}
void wn_like(wordnet& wn, const std::string& word, pos_t pos)
{
if (word == "")
return;
typedef std::vector<wnb::index> vi;
std::pair<vi::iterator,vi::iterator> bounds = wn.get_indexes(word);
for (vi::iterator it = bounds.first; it != bounds.second; it++)
{
if (pos != -1 && it->pos == pos)
{
print_synsets(pos, *it, wn);
}
}
}
void batch_test(wordnet& wn, std::vector<std::string>& word_list)
{
for (std::size_t i = 0; i < word_list.size(); i++)
{
for (unsigned p = 1; p < POS_ARRAY_SIZE; p++)
{
pos_t pos = (pos_t) p;
wn_like(wn, word_list[i], pos);
std::string mword = wn.morphword(word_list[i], pos);
if (mword != word_list[i])
wn_like(wn, mword, pos);
}
}
}
int main(int argc, char ** argv)
{
if (usage(argc, argv))
return 1;
// read command line
std::string wordnet_dir = argv[1];
std::string test_file = argv[2];
wordnet wn(wordnet_dir);
// read test file
std::string list = ext::read_file(test_file);
std::vector<std::string> wl = ext::split(list);
batch_test(wn, wl);
}

View File

@ -0,0 +1,146 @@
#ifndef _NLTK_SIMILARITY_HH
# define _NLTK_SIMILARITY_HH
# include <queue>
# include <boost/graph/filtered_graph.hpp>
# include <wnb/core/wordnet.hh>
namespace wnb
{
namespace internal
{
// Helper class that filters out all relations other than hypernymy
template <typename PointerSymbolMap>
struct hyper_edge
{
hyper_edge() { }
hyper_edge(PointerSymbolMap pointer_symbol)
: m_pointer_symbol(pointer_symbol) { }
template <typename Edge>
bool operator()(const Edge& e) const
{
int p_s = get(m_pointer_symbol, e);
return p_s == 1; // hypernym (instance hypernym not used here)
}
PointerSymbolMap m_pointer_symbol;
};
} // end of namespace internal
class nltk_similarity
{
typedef boost::property_map<wordnet::graph,
int ptr::*>::type PointerSymbolMap;
typedef boost::filtered_graph<wordnet::graph,
internal::hyper_edge<PointerSymbolMap> > G;
typedef boost::graph_traits<G>::vertex_descriptor vertex;
internal::hyper_edge<PointerSymbolMap> filter;
G fg;
public:
nltk_similarity(wordnet& wn)
: filter(get(&ptr::pointer_symbol, wn.wordnet_graph)),
fg(wn.wordnet_graph, filter)
{ }
/// Get list of hypernyms of s along with distance to s
std::map<vertex, int> hypernym_map(vertex s);
/// Get the shortest path distance between synset1 and synset2.
int shortest_path_distance(const synset& synset1, const synset& synset2);
/// Return the path similarity 1 / (distance + 1), or -1 if the synsets are not connected
float operator()(const synset& synset1, const synset& synset2, int=0);
};
std::map<nltk_similarity::vertex, int>
nltk_similarity::hypernym_map(nltk_similarity::vertex s)
{
std::map<vertex, int> map;
// Python:
// for (hypernym in self[HYPERNYM])
// distances |= hypernym.hypernym_distances(distance+1);
boost::graph_traits<G>::out_edge_iterator e, e_end;
std::queue<vertex> q;
q.push(s);
map[s] = 0;
while (!q.empty())
{
vertex u = q.front(); q.pop();
int new_d = map[u] + 1;
for (boost::tuples::tie(e, e_end) = out_edges(u, fg); e != e_end; ++e)
{
vertex v = target(*e,fg);
q.push(v);
if (map.find(v) != map.end())
{
if (new_d < map[v])
map[v] = new_d;
else
q.pop();
}
else
map[v] = new_d;
}
}
return map;
}
int
nltk_similarity::shortest_path_distance(const synset& synset1, const synset& synset2)
{
vertex v1 = synset1.id;
vertex v2 = synset2.id;
std::map<vertex, int> map1 = hypernym_map(v1);
std::map<vertex, int> map2 = hypernym_map(v2);
// For each ancestor synset common to both subject synsets, find the
// connecting path length. Return the shortest of these.
int path_distance = -1;
std::map<vertex, int>::iterator it, it2;
for (it = map1.begin(); it != map1.end(); it++)
for (it2 = map2.begin(); it2 != map2.end(); it2++)
if (fg[it->first] == fg[it2->first])
{
int new_distance = it->second + it2->second;
if (path_distance < 0 || new_distance < path_distance)
path_distance = new_distance;
}
return path_distance;
}
float
nltk_similarity::operator()(const synset& synset1, const synset& synset2, int)
{
int distance = shortest_path_distance(synset1, synset2);
if (distance >= 0)
return 1. / (distance + 1);
else
return -1;
}
} // end of namespace wnb
#endif /* _NLTK_SIMILARITY_HH */

View File

@ -0,0 +1,90 @@
#ifndef _STD_EXT_HH
# define _STD_EXT_HH
# include <string>
# include <sstream>
# include <fstream>
# include <algorithm>
# include <stdexcept>
namespace ext
{
/// Read a file, return the content as a C++ string
inline
std::string read_file(const std::string& fn)
{
std::ifstream is;
is.open(fn.c_str(), std::ios::binary);
if (!is.is_open())
throw std::runtime_error("File not found: " + fn);
std::string str((std::istreambuf_iterator<char>(is)),
std::istreambuf_iterator<char>());
return str;
}
/// Split a std::string
inline
std::vector<std::string> split(const std::string& str)
{
std::vector<std::string> tokens;
std::istringstream iss(str);
copy(std::istream_iterator<std::string>(iss),
std::istream_iterator<std::string>(),
std::back_inserter< std::vector<std::string> >(tokens));
return tokens;
}
/// Split a std::string on separator
inline
std::vector<std::string> split(const std::string& s, char separator)
{
std::vector<std::string> output;
std::string::size_type prev_pos = 0, pos = 0;
while((pos = s.find(separator, pos)) != std::string::npos)
{
std::string substring( s.substr(prev_pos, pos-prev_pos) );
output.push_back(substring);
prev_pos = ++pos;
}
output.push_back(s.substr(prev_pos, pos-prev_pos));
return output;
}
inline
bool
ends_with(const std::string& str, const std::string& ending)
{
if (str.length() >= ending.length())
{
int cmp = str.compare(str.length() - ending.length(),
ending.length(), ending);
return (0 == cmp);
}
return false;
}
/// Sorted unique
template <typename T>
inline
T s_unique(T& v)
{
T out;
std::sort(v.begin(), v.end());
typename T::iterator last = std::unique(v.begin(),v.end());
out.resize(last - v.begin());
std::copy(v.begin(), last, out.begin());
return out;
}
} // end of ext
#endif /* _STD_EXT_HH */

View File

@ -124,3 +124,4 @@ endif()
set_source_files_properties("pointInPolygon.cpp" PROPERTIES COMPILE_FLAGS -fno-sanitize=signed-integer-overflow)
target_link_libraries(clickhouse_functions PRIVATE stemmer)
target_link_libraries(clickhouse_functions PRIVATE wnb)

View File

@ -1,10 +1,11 @@
#include <Common/Exception.h>
#include <Interpreters/SynonymsExtensions.h>
#include <Functions/SynonymsExtensions.h>
#include <fstream>
#include <list>
#include <boost/algorithm/string.hpp>
#include <wnb/core/wordnet.hh>
namespace DB
{
@ -48,7 +49,7 @@ public:
}
}
Synset * getSynonyms(const std::string_view & token) const override
const Synset * getSynonyms(const std::string_view & token) const override
{
auto it = table.find(token);
@ -62,20 +63,23 @@ public:
class WordnetSynonymsExtension : public ISynonymsExtension
{
private:
// std::vector<std::vector<String>> data;
wnb::wordnet wn;
public:
WordnetSynonymsExtension(const String & /*path*/)
{
WordnetSynonymsExtension(const String & path) : wn(path) {}
}
Synset * getSynonyms(const std::string_view & /*token*/) const override
const Synset * getSynonyms(const std::string_view & token) const override
{
return nullptr;
return wn.get_synset(std::string(token));
}
};
/// Duplicate of code from StringUtils.h. Copied here for fewer dependencies.
static bool startsWith(const std::string & s, const char * prefix)
{
return s.size() >= strlen(prefix) && 0 == memcmp(s.data(), prefix, strlen(prefix));
}
SynonymsExtensions::SynonymsExtensions(const Poco::Util::AbstractConfiguration & config)
{
String prefix = "synonyms_extensions";
@ -89,7 +93,7 @@ SynonymsExtensions::SynonymsExtensions(const Poco::Util::AbstractConfiguration &
for (const auto & key : keys)
{
if (key == "extension")
if (startsWith(key, "extension"))
{
const auto & ext_name = config.getString(prefix + "." + key + ".name", "");
const auto & ext_path = config.getString(prefix + "." + key + ".path", "");

View File

@ -19,7 +19,7 @@ public:
//ISynonymsExtension(const String & path);
virtual Synset * getSynonyms(const std::string_view & token) const = 0;
virtual const Synset * getSynonyms(const std::string_view & token) const = 0;
virtual ~ISynonymsExtension() = default;
};

View File

@ -6,7 +6,7 @@
#include <Functions/FunctionFactory.h>
#include <Functions/FunctionHelpers.h>
#include <Functions/IFunction.h>
#include <Interpreters/SynonymsExtensions.h>
#include <Functions/SynonymsExtensions.h>
#include <Interpreters/Context.h>
#include <string_view>

View File

@ -53,7 +53,6 @@
#include <Interpreters/InterserverCredentials.h>
#include <Interpreters/Cluster.h>
#include <Interpreters/InterserverIOHandler.h>
#include <Interpreters/SynonymsExtensions.h>
#include <Interpreters/SystemLog.h>
#include <Interpreters/Context.h>
#include <Interpreters/DDLWorker.h>
@ -75,7 +74,7 @@
#include <Interpreters/DatabaseCatalog.h>
#include <Storages/MergeTree/BackgroundJobsExecutor.h>
#include <Storages/MergeTree/MergeTreeDataPartUUID.h>
#include <Functions/SynonymsExtensions.h>
namespace ProfileEvents
{