mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-11-21 15:12:02 +00:00
added WordNet synonyms extensions
This commit is contained in:
parent
876f51ab95
commit
ed12fb5604
1
contrib/CMakeLists.txt
vendored
1
contrib/CMakeLists.txt
vendored
@ -331,3 +331,4 @@ endif()
|
||||
|
||||
add_subdirectory(fast_float)
|
||||
add_subdirectory(libstemmer-c-cmake)
|
||||
add_subdirectory(wordnet-blast-cmake)
|
||||
|
@ -13,11 +13,12 @@ if (NOT USE_INTERNAL_BOOST_LIBRARY)
|
||||
regex
|
||||
context
|
||||
coroutine
|
||||
graph
|
||||
)
|
||||
|
||||
if(Boost_INCLUDE_DIR AND Boost_FILESYSTEM_LIBRARY AND Boost_FILESYSTEM_LIBRARY AND
|
||||
Boost_PROGRAM_OPTIONS_LIBRARY AND Boost_REGEX_LIBRARY AND Boost_SYSTEM_LIBRARY AND Boost_CONTEXT_LIBRARY AND
|
||||
Boost_COROUTINE_LIBRARY)
|
||||
Boost_COROUTINE_LIBRARY AND Boost_GRAPH_LIBRARY)
|
||||
|
||||
set(EXTERNAL_BOOST_FOUND 1)
|
||||
|
||||
@ -32,6 +33,7 @@ if (NOT USE_INTERNAL_BOOST_LIBRARY)
|
||||
add_library (_boost_system INTERFACE)
|
||||
add_library (_boost_context INTERFACE)
|
||||
add_library (_boost_coroutine INTERFACE)
|
||||
add_library (_boost_graph INTERFACE)
|
||||
|
||||
target_link_libraries (_boost_filesystem INTERFACE ${Boost_FILESYSTEM_LIBRARY})
|
||||
target_link_libraries (_boost_iostreams INTERFACE ${Boost_IOSTREAMS_LIBRARY})
|
||||
@ -40,6 +42,7 @@ if (NOT USE_INTERNAL_BOOST_LIBRARY)
|
||||
target_link_libraries (_boost_system INTERFACE ${Boost_SYSTEM_LIBRARY})
|
||||
target_link_libraries (_boost_context INTERFACE ${Boost_CONTEXT_LIBRARY})
|
||||
target_link_libraries (_boost_coroutine INTERFACE ${Boost_COROUTINE_LIBRARY})
|
||||
target_link_libraries (_boost_graph INTERFACE ${Boost_GRAPH_LIBRARY})
|
||||
|
||||
add_library (boost::filesystem ALIAS _boost_filesystem)
|
||||
add_library (boost::iostreams ALIAS _boost_iostreams)
|
||||
@ -48,6 +51,7 @@ if (NOT USE_INTERNAL_BOOST_LIBRARY)
|
||||
add_library (boost::system ALIAS _boost_system)
|
||||
add_library (boost::context ALIAS _boost_context)
|
||||
add_library (boost::coroutine ALIAS _boost_coroutine)
|
||||
add_library (boost::graph ALIAS _boost_graph)
|
||||
else()
|
||||
set(EXTERNAL_BOOST_FOUND 0)
|
||||
message (${RECONFIGURE_MESSAGE_LEVEL} "Can't find system boost")
|
||||
@ -221,4 +225,16 @@ if (NOT EXTERNAL_BOOST_FOUND)
|
||||
add_library (boost::coroutine ALIAS _boost_coroutine)
|
||||
target_include_directories (_boost_coroutine PRIVATE ${LIBRARY_DIR})
|
||||
target_link_libraries(_boost_coroutine PRIVATE _boost_context)
|
||||
|
||||
# graph
|
||||
|
||||
set (SRCS_GRAPH
|
||||
"${LIBRARY_DIR}/libs/graph/src/graphml.cpp"
|
||||
"${LIBRARY_DIR}/libs/graph/src/read_graphviz_new.cpp"
|
||||
)
|
||||
|
||||
add_library (_boost_graph ${SRCS_GRAPH})
|
||||
add_library (boost::graph ALIAS _boost_graph)
|
||||
target_include_directories (_boost_graph PRIVATE ${LIBRARY_DIR})
|
||||
|
||||
endif ()
|
||||
|
13
contrib/wordnet-blast-cmake/CMakeLists.txt
Normal file
13
contrib/wordnet-blast-cmake/CMakeLists.txt
Normal file
@ -0,0 +1,13 @@
|
||||
# Build rules for the vendored wordnet-blast sources located in
# contrib/wordnet-blast.
set(LIBRARY_DIR "${ClickHouse_SOURCE_DIR}/contrib/wordnet-blast")

# Only the core translation units are compiled into the library.
add_library(wnb
    "${LIBRARY_DIR}/wnb/core/info_helper.cc"
    "${LIBRARY_DIR}/wnb/core/load_wordnet.cc"
    "${LIBRARY_DIR}/wnb/core/wordnet.cc"
)

# The sources include their own headers as <wnb/...>, so the root of the
# vendored tree is exported as an include directory to consumers as well.
target_include_directories(wnb PUBLIC "${LIBRARY_DIR}")

# Boost (headers and the compiled graph component) is linked privately.
target_link_libraries(wnb PRIVATE boost::headers_only boost::graph)
|
1
contrib/wordnet-blast/AUTHORS
Normal file
1
contrib/wordnet-blast/AUTHORS
Normal file
@ -0,0 +1 @@
|
||||
Ugo Jardonnet ugo.jardonnet/gmail
|
65
contrib/wordnet-blast/CMakeLists.txt
Normal file
65
contrib/wordnet-blast/CMakeLists.txt
Normal file
@ -0,0 +1,65 @@
|
||||
CMAKE_MINIMUM_REQUIRED(VERSION 2.6)
|
||||
|
||||
PROJECT(wnb)
|
||||
|
||||
# Boost dependency
|
||||
#--------------------------------------------------
|
||||
|
||||
# IF (${CMAKE_SYSTEM_NAME} MATCHES "Darwin")
|
||||
# SET (BOOST_ROOT /Developer/boost_build/) # Suggested path
|
||||
# ELSE()
|
||||
# SET (BOOST_ROOT "/usr/include")
|
||||
# ENDIF()
|
||||
##############
|
||||
SET (BOOST_ROOT "${ClickHouse_SOURCE_DIR}/contrib/boost")
|
||||
##############
|
||||
MESSAGE(STATUS "** Search Boost root: ${BOOST_ROOT}")
|
||||
FIND_PACKAGE(Boost 1.70.0 COMPONENTS graph REQUIRED)
|
||||
MESSAGE(STATUS "** Boost Include: ${Boost_INCLUDE_DIR}")
|
||||
MESSAGE(STATUS "** Boost Libraries: ${Boost_LIBRARY_DIRS}")
|
||||
MESSAGE(STATUS "** Boost Libraries: ${Boost_LIBRARIES}")
|
||||
|
||||
INCLUDE_DIRECTORIES(${Boost_INCLUDE_DIR})
|
||||
LINK_DIRECTORIES(${Boost_LIBRARY_DIRS})
|
||||
|
||||
# Project
|
||||
#--------------------------------------------------
|
||||
|
||||
LINK_DIRECTORIES(${wnb_SOURCE_DIR}/lib)
|
||||
INCLUDE_DIRECTORIES(${PROJECT_BINARY_DIR})
|
||||
|
||||
SET(PROJECT_VERSION "0.6")
|
||||
SET(ARCHIVE_NAME ${CMAKE_PROJECT_NAME}-${PROJECT_VERSION})
|
||||
|
||||
ADD_CUSTOM_TARGET(dist
|
||||
COMMAND git archive --prefix=${ARCHIVE_NAME}/ HEAD
|
||||
| bzip2 > ${CMAKE_BINARY_DIR}/${ARCHIVE_NAME}.tar.bz2
|
||||
WORKING_DIRECTORY ${CMAKE_SOURCE_DIR})
|
||||
|
||||
ADD_CUSTOM_TARGET(check
|
||||
COMMAND ./check/check.sh ./check/list.txt)
|
||||
|
||||
|
||||
## Compiler flags
|
||||
# CMAKE_CXX_FLAGS is one space-separated command-line string, not a CMake
# list. list(APPEND) joins with ';', which would end up verbatim in the
# compiler command line whenever the variable is already non-empty, so the
# extra flags are concatenated with a space instead.
IF (CMAKE_COMPILER_IS_GNUCXX)
  SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} --std=c++11 -O3 -DNDEBUG -Wall -Wextra")
  # Debug alternative (swap with the line above):
  #SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g -Wall -Wextra")
ENDIF()
|
||||
|
||||
SET(WNB_SRCS wnb/core/wordnet.cc
|
||||
wnb/core/load_wordnet.cc wnb/core/info_helper.cc)
|
||||
|
||||
# Executable
|
||||
#--------------------------------------------------
|
||||
ADD_EXECUTABLE (wntest wnb/main.cc ${WNB_SRCS})
|
||||
SET(EXECUTABLE_OUTPUT_PATH ${wnb_BINARY_DIR}/bin)
|
||||
|
||||
# Static library
|
||||
#--------------------------------------------------
|
||||
ADD_LIBRARY(wnb ${WNB_SRCS})
|
||||
SET(LIBRARY_OUTPUT_PATH ${wnb_BINARY_DIR}/lib)
|
||||
|
||||
IF (Boost_FOUND)
|
||||
TARGET_LINK_LIBRARIES(wntest ${Boost_LIBRARIES})
|
||||
TARGET_LINK_LIBRARIES(wnb ${Boost_LIBRARIES})
|
||||
ENDIF()
|
43
contrib/wordnet-blast/README
Normal file
43
contrib/wordnet-blast/README
Normal file
@ -0,0 +1,43 @@
|
||||
|
||||
=====================================================================
|
||||
WordNet Blast
|
||||
=====================================================================
|
||||
|
||||
In-memory access to the WordNet ontology.
|
||||
|
||||
DEPENDENCIES:
|
||||
boost 1.46
|
||||
wordnet-sense-index
|
||||
colordiff (for wntest)
|
||||
|
||||
INSTALL:
|
||||
cmake CMakeLists.txt
|
||||
make
|
||||
|
||||
TESTS: (Beta)
|
||||
make check
|
||||
|
||||
USAGE:
|
||||
#include "wordnet.hh"
|
||||
#include "wnb/nltk_similarity.hh"
|
||||
|
||||
using namespace std;
|
||||
using namespace wnb;
|
||||
|
||||
int main()
|
||||
{
|
||||
wordnet wn(PATH_TO_WORDNET);
|
||||
|
||||
vector<synset> synsets1 = wn.get_synsets("cat");
|
||||
vector<synset> synsets2 = wn.get_synsets("dog");
|
||||
|
||||
nltk_similarity similarity(wn);
|
||||
float d = similarity(synsets1[0], synsets2[0], 6);
|
||||
}
|
||||
|
||||
BUGS:
|
||||
- Word Morphing is sometimes incorrect.
|
||||
|
||||
REFERENCE:
|
||||
George A. Miller (1995). WordNet: A Lexical Database for English.
|
||||
Communications of the ACM Vol. 38, No. 11: 39-41.
|
25
contrib/wordnet-blast/WORDNET_LICENSE
Normal file
25
contrib/wordnet-blast/WORDNET_LICENSE
Normal file
@ -0,0 +1,25 @@
|
||||
This license is available as the file LICENSE in any downloaded version of
|
||||
WordNet.
|
||||
|
||||
WordNet Release 3.0
|
||||
|
||||
This software and database is being provided to you, the LICENSEE, by Princeton
|
||||
University under the following license. By obtaining, using and/or copying this
|
||||
software and database, you agree that you have read, understood, and will comply
|
||||
with these terms and conditions.: Permission to use, copy, modify and distribute
|
||||
this software and database and its documentation for any purpose and without fee
|
||||
or royalty is hereby granted, provided that you agree to comply with the
|
||||
following copyright notice and statements, including the disclaimer, and that
|
||||
the same appear on ALL copies of the software, database and documentation,
|
||||
including modifications that you make for internal use or for distribution.
|
||||
WordNet 3.0 Copyright 2006 by Princeton University. All rights reserved. THIS
|
||||
SOFTWARE AND DATABASE IS PROVIDED "AS IS" AND PRINCETON UNIVERSITY MAKES NO
|
||||
REPRESENTATIONS OR WARRANTIES, EXPRESS OR IMPLIED. BY WAY OF EXAMPLE, BUT NOT
|
||||
LIMITATION, PRINCETON UNIVERSITY MAKES NO REPRESENTATIONS OR WARRANTIES OF
|
||||
MERCHANT- ABILITY OR FITNESS FOR ANY PARTICULAR PURPOSE OR THAT THE USE OF THE
|
||||
LICENSED SOFTWARE, DATABASE OR DOCUMENTATION WILL NOT INFRINGE ANY THIRD PARTY
|
||||
PATENTS, COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS. The name of Princeton
|
||||
University or Princeton may not be used in advertising or publicity pertaining
|
||||
to distribution of the software and/or database. Title to copyright in this
|
||||
software, database and any associated documentation shall at all times remain
|
||||
with Princeton University and LICENSEE agrees to preserve same.
|
11
contrib/wordnet-blast/changelog
Normal file
11
contrib/wordnet-blast/changelog
Normal file
@ -0,0 +1,11 @@
|
||||
* 0.6
|
||||
- Improve tests
|
||||
- get_synsets by pos
|
||||
- Load wordnet a bit faster
|
||||
- Fix build on Mac Os (thanks to Roman Kutlak)
|
||||
- Update doc
|
||||
- Improve testing
|
||||
* 0.5
|
||||
- get_synsets
|
||||
with morphing partially implemented (thanks to Yaron Feigin)
|
||||
- sense similarity
|
852
contrib/wordnet-blast/check/biglist.txt
Normal file
852
contrib/wordnet-blast/check/biglist.txt
Normal file
@ -0,0 +1,852 @@
|
||||
a
|
||||
able
|
||||
about
|
||||
account
|
||||
acid
|
||||
across
|
||||
act
|
||||
addition
|
||||
adjustment
|
||||
advertisement
|
||||
after
|
||||
again
|
||||
against
|
||||
agreement
|
||||
air
|
||||
all
|
||||
almost
|
||||
among
|
||||
amount
|
||||
amusement
|
||||
and
|
||||
angle
|
||||
angry
|
||||
animal
|
||||
answer
|
||||
ant
|
||||
any
|
||||
apparatus
|
||||
apple
|
||||
approval
|
||||
arch
|
||||
argument
|
||||
arm
|
||||
army
|
||||
art
|
||||
as
|
||||
at
|
||||
attack
|
||||
attempt
|
||||
attention
|
||||
attraction
|
||||
authority
|
||||
automatic
|
||||
awake
|
||||
baby
|
||||
back
|
||||
bad
|
||||
bag
|
||||
balance
|
||||
ball
|
||||
band
|
||||
base
|
||||
basin
|
||||
basket
|
||||
bath
|
||||
be
|
||||
beautiful
|
||||
because
|
||||
bed
|
||||
bee
|
||||
before
|
||||
behaviour
|
||||
belief
|
||||
bell
|
||||
bent
|
||||
berry
|
||||
between
|
||||
bird
|
||||
birth
|
||||
bit
|
||||
bite
|
||||
bitter
|
||||
black
|
||||
blade
|
||||
blood
|
||||
blow
|
||||
blue
|
||||
board
|
||||
boat
|
||||
body
|
||||
boiling
|
||||
bone
|
||||
book
|
||||
boot
|
||||
bottle
|
||||
box
|
||||
boy
|
||||
brain
|
||||
brake
|
||||
branch
|
||||
brass
|
||||
bread
|
||||
breath
|
||||
brick
|
||||
bridge
|
||||
bright
|
||||
broken
|
||||
brother
|
||||
brown
|
||||
brush
|
||||
bucket
|
||||
building
|
||||
bulb
|
||||
burn
|
||||
burst
|
||||
business
|
||||
but
|
||||
butter
|
||||
button
|
||||
by
|
||||
cake
|
||||
camera
|
||||
canvas
|
||||
card
|
||||
care
|
||||
carriage
|
||||
cart
|
||||
cat
|
||||
cause
|
||||
certain
|
||||
chain
|
||||
chalk
|
||||
chance
|
||||
change
|
||||
cheap
|
||||
cheese
|
||||
chemical
|
||||
chest
|
||||
chief
|
||||
chin
|
||||
church
|
||||
circle
|
||||
clean
|
||||
clear
|
||||
clock
|
||||
cloth
|
||||
cloud
|
||||
coal
|
||||
coat
|
||||
cold
|
||||
collar
|
||||
colour
|
||||
comb
|
||||
come
|
||||
comfort
|
||||
committee
|
||||
common
|
||||
company
|
||||
comparison
|
||||
competition
|
||||
complete
|
||||
complex
|
||||
condition
|
||||
connection
|
||||
conscious
|
||||
control
|
||||
cook
|
||||
copper
|
||||
copy
|
||||
cord
|
||||
cork
|
||||
cotton
|
||||
cough
|
||||
country
|
||||
cover
|
||||
cow
|
||||
crack
|
||||
credit
|
||||
crime
|
||||
cruel
|
||||
crush
|
||||
cry
|
||||
cup
|
||||
cup
|
||||
current
|
||||
curtain
|
||||
curve
|
||||
cushion
|
||||
damage
|
||||
danger
|
||||
dark
|
||||
daughter
|
||||
day
|
||||
dead
|
||||
dear
|
||||
death
|
||||
debt
|
||||
decision
|
||||
deep
|
||||
degree
|
||||
delicate
|
||||
dependent
|
||||
design
|
||||
desire
|
||||
destruction
|
||||
detail
|
||||
development
|
||||
different
|
||||
digestion
|
||||
direction
|
||||
dirty
|
||||
discovery
|
||||
discussion
|
||||
disease
|
||||
disgust
|
||||
distance
|
||||
distribution
|
||||
division
|
||||
do
|
||||
dog
|
||||
door
|
||||
doubt
|
||||
down
|
||||
drain
|
||||
drawer
|
||||
dress
|
||||
drink
|
||||
driving
|
||||
drop
|
||||
dry
|
||||
dust
|
||||
ear
|
||||
early
|
||||
earth
|
||||
east
|
||||
edge
|
||||
education
|
||||
effect
|
||||
egg
|
||||
elastic
|
||||
electric
|
||||
end
|
||||
engine
|
||||
enough
|
||||
equal
|
||||
error
|
||||
even
|
||||
event
|
||||
ever
|
||||
every
|
||||
example
|
||||
exchange
|
||||
existence
|
||||
expansion
|
||||
experience
|
||||
expert
|
||||
eye
|
||||
face
|
||||
fact
|
||||
fall
|
||||
false
|
||||
family
|
||||
far
|
||||
farm
|
||||
fat
|
||||
father
|
||||
fear
|
||||
feather
|
||||
feeble
|
||||
feeling
|
||||
female
|
||||
fertile
|
||||
fiction
|
||||
field
|
||||
fight
|
||||
finger
|
||||
fire
|
||||
first
|
||||
fish
|
||||
fixed
|
||||
flag
|
||||
flame
|
||||
flat
|
||||
flight
|
||||
floor
|
||||
flower
|
||||
fly
|
||||
fold
|
||||
food
|
||||
foolish
|
||||
foot
|
||||
for
|
||||
force
|
||||
fork
|
||||
form
|
||||
forward
|
||||
fowl
|
||||
frame
|
||||
free
|
||||
frequent
|
||||
friend
|
||||
from
|
||||
front
|
||||
fruit
|
||||
full
|
||||
future
|
||||
garden
|
||||
general
|
||||
get
|
||||
girl
|
||||
give
|
||||
glass
|
||||
glove
|
||||
go
|
||||
goat
|
||||
gold
|
||||
good
|
||||
government
|
||||
grain
|
||||
grass
|
||||
great
|
||||
green
|
||||
grey
|
||||
grip
|
||||
group
|
||||
growth
|
||||
guide
|
||||
gun
|
||||
hair
|
||||
hammer
|
||||
hand
|
||||
hanging
|
||||
happy
|
||||
harbour
|
||||
hard
|
||||
harmony
|
||||
hat
|
||||
hate
|
||||
have
|
||||
he
|
||||
head
|
||||
healthy
|
||||
hear
|
||||
hearing
|
||||
heart
|
||||
heat
|
||||
help
|
||||
high
|
||||
history
|
||||
hole
|
||||
hollow
|
||||
hook
|
||||
hope
|
||||
horn
|
||||
horse
|
||||
hospital
|
||||
hour
|
||||
house
|
||||
how
|
||||
humour
|
||||
I
|
||||
ice
|
||||
idea
|
||||
if
|
||||
ill
|
||||
important
|
||||
impulse
|
||||
in
|
||||
increase
|
||||
industry
|
||||
ink
|
||||
insect
|
||||
instrument
|
||||
insurance
|
||||
interest
|
||||
invention
|
||||
iron
|
||||
island
|
||||
jelly
|
||||
jewel
|
||||
join
|
||||
journey
|
||||
judge
|
||||
jump
|
||||
keep
|
||||
kettle
|
||||
key
|
||||
kick
|
||||
kind
|
||||
kiss
|
||||
knee
|
||||
knife
|
||||
knot
|
||||
knowledge
|
||||
land
|
||||
language
|
||||
last
|
||||
late
|
||||
laugh
|
||||
law
|
||||
lead
|
||||
leaf
|
||||
learning
|
||||
leather
|
||||
left
|
||||
leg
|
||||
let
|
||||
letter
|
||||
level
|
||||
library
|
||||
lift
|
||||
light
|
||||
like
|
||||
limit
|
||||
line
|
||||
linen
|
||||
lip
|
||||
liquid
|
||||
list
|
||||
little
|
||||
living
|
||||
lock
|
||||
long
|
||||
look
|
||||
loose
|
||||
loss
|
||||
loud
|
||||
love
|
||||
low
|
||||
machine
|
||||
make
|
||||
male
|
||||
man
|
||||
manager
|
||||
map
|
||||
mark
|
||||
market
|
||||
married
|
||||
mass
|
||||
match
|
||||
material
|
||||
may
|
||||
meal
|
||||
measure
|
||||
meat
|
||||
medical
|
||||
meeting
|
||||
memory
|
||||
metal
|
||||
middle
|
||||
military
|
||||
milk
|
||||
mind
|
||||
mine
|
||||
minute
|
||||
mist
|
||||
mixed
|
||||
money
|
||||
monkey
|
||||
month
|
||||
moon
|
||||
morning
|
||||
mother
|
||||
motion
|
||||
mountain
|
||||
mouth
|
||||
move
|
||||
much
|
||||
muscle
|
||||
music
|
||||
nail
|
||||
name
|
||||
narrow
|
||||
nation
|
||||
natural
|
||||
near
|
||||
necessary
|
||||
neck
|
||||
need
|
||||
needle
|
||||
nerve
|
||||
net
|
||||
new
|
||||
news
|
||||
night
|
||||
no
|
||||
noise
|
||||
normal
|
||||
north
|
||||
nose
|
||||
not
|
||||
note
|
||||
now
|
||||
number
|
||||
nut
|
||||
observation
|
||||
of
|
||||
off
|
||||
offer
|
||||
office
|
||||
oil
|
||||
old
|
||||
on
|
||||
only
|
||||
open
|
||||
operation
|
||||
opinion
|
||||
opposite
|
||||
or
|
||||
orange
|
||||
order
|
||||
organization
|
||||
ornament
|
||||
other
|
||||
out
|
||||
oven
|
||||
over
|
||||
owner
|
||||
page
|
||||
pain
|
||||
paint
|
||||
paper
|
||||
parallel
|
||||
parcel
|
||||
part
|
||||
past
|
||||
paste
|
||||
payment
|
||||
peace
|
||||
pen
|
||||
pencil
|
||||
person
|
||||
physical
|
||||
picture
|
||||
pig
|
||||
pin
|
||||
pipe
|
||||
place
|
||||
plane
|
||||
plant
|
||||
plate
|
||||
play
|
||||
please
|
||||
pleasure
|
||||
plough
|
||||
pocket
|
||||
point
|
||||
poison
|
||||
polish
|
||||
political
|
||||
poor
|
||||
porter
|
||||
position
|
||||
possible
|
||||
pot
|
||||
potato
|
||||
powder
|
||||
power
|
||||
present
|
||||
price
|
||||
print
|
||||
prison
|
||||
private
|
||||
probable
|
||||
process
|
||||
produce
|
||||
profit
|
||||
property
|
||||
prose
|
||||
protest
|
||||
public
|
||||
pull
|
||||
pump
|
||||
punishment
|
||||
purpose
|
||||
push
|
||||
put
|
||||
quality
|
||||
question
|
||||
quick
|
||||
quiet
|
||||
quite
|
||||
rail
|
||||
rain
|
||||
range
|
||||
rat
|
||||
rate
|
||||
ray
|
||||
reaction
|
||||
reading
|
||||
ready
|
||||
reason
|
||||
receipt
|
||||
record
|
||||
red
|
||||
regret
|
||||
regular
|
||||
relation
|
||||
religion
|
||||
representative
|
||||
request
|
||||
respect
|
||||
responsible
|
||||
rest
|
||||
reward
|
||||
rhythm
|
||||
rice
|
||||
right
|
||||
ring
|
||||
river
|
||||
road
|
||||
rod
|
||||
roll
|
||||
roof
|
||||
room
|
||||
root
|
||||
rough
|
||||
round
|
||||
rub
|
||||
rule
|
||||
run
|
||||
sad
|
||||
safe
|
||||
sail
|
||||
salt
|
||||
same
|
||||
sand
|
||||
say
|
||||
scale
|
||||
school
|
||||
science
|
||||
scissors
|
||||
screw
|
||||
sea
|
||||
seat
|
||||
second
|
||||
secret
|
||||
secretary
|
||||
see
|
||||
seed
|
||||
seem
|
||||
selection
|
||||
self
|
||||
send
|
||||
sense
|
||||
separate
|
||||
serious
|
||||
servant
|
||||
sex
|
||||
shade
|
||||
shake
|
||||
shame
|
||||
sharp
|
||||
sheep
|
||||
shelf
|
||||
ship
|
||||
shirt
|
||||
shock
|
||||
shoe
|
||||
short
|
||||
shut
|
||||
side
|
||||
sign
|
||||
silk
|
||||
silver
|
||||
simple
|
||||
sister
|
||||
size
|
||||
skin
|
||||
|
||||
skirt
|
||||
sky
|
||||
sleep
|
||||
slip
|
||||
slope
|
||||
slow
|
||||
small
|
||||
smash
|
||||
smell
|
||||
smile
|
||||
smoke
|
||||
smooth
|
||||
snake
|
||||
sneeze
|
||||
snow
|
||||
so
|
||||
soap
|
||||
society
|
||||
sock
|
||||
soft
|
||||
solid
|
||||
some
|
||||
|
||||
son
|
||||
song
|
||||
sort
|
||||
sound
|
||||
soup
|
||||
south
|
||||
space
|
||||
spade
|
||||
special
|
||||
sponge
|
||||
spoon
|
||||
spring
|
||||
square
|
||||
stage
|
||||
stamp
|
||||
star
|
||||
start
|
||||
statement
|
||||
station
|
||||
steam
|
||||
steel
|
||||
stem
|
||||
step
|
||||
stick
|
||||
sticky
|
||||
stiff
|
||||
still
|
||||
stitch
|
||||
stocking
|
||||
stomach
|
||||
stone
|
||||
stop
|
||||
store
|
||||
story
|
||||
straight
|
||||
strange
|
||||
street
|
||||
stretch
|
||||
strong
|
||||
structure
|
||||
substance
|
||||
such
|
||||
sudden
|
||||
sugar
|
||||
suggestion
|
||||
summer
|
||||
sun
|
||||
support
|
||||
surprise
|
||||
sweet
|
||||
swim
|
||||
system
|
||||
table
|
||||
tail
|
||||
take
|
||||
talk
|
||||
tall
|
||||
taste
|
||||
tax
|
||||
teaching
|
||||
tendency
|
||||
test
|
||||
than
|
||||
that
|
||||
the
|
||||
then
|
||||
theory
|
||||
there
|
||||
thick
|
||||
thin
|
||||
thing
|
||||
this
|
||||
thought
|
||||
thread
|
||||
throat
|
||||
through
|
||||
through
|
||||
thumb
|
||||
thunder
|
||||
ticket
|
||||
tight
|
||||
till
|
||||
time
|
||||
tin
|
||||
tired
|
||||
to
|
||||
toe
|
||||
together
|
||||
tomorrow
|
||||
tongue
|
||||
tooth
|
||||
top
|
||||
touch
|
||||
town
|
||||
trade
|
||||
train
|
||||
transport
|
||||
tray
|
||||
tree
|
||||
trick
|
||||
trouble
|
||||
trousers
|
||||
true
|
||||
turn
|
||||
twist
|
||||
umbrella
|
||||
under
|
||||
unit
|
||||
up
|
||||
use
|
||||
value
|
||||
verse
|
||||
very
|
||||
vessel
|
||||
view
|
||||
violent
|
||||
voice
|
||||
waiting
|
||||
walk
|
||||
wall
|
||||
war
|
||||
warm
|
||||
wash
|
||||
waste
|
||||
watch
|
||||
water
|
||||
wave
|
||||
wax
|
||||
way
|
||||
weather
|
||||
week
|
||||
weight
|
||||
well
|
||||
west
|
||||
wet
|
||||
wheel
|
||||
when
|
||||
where
|
||||
while
|
||||
whip
|
||||
whistle
|
||||
white
|
||||
who
|
||||
why
|
||||
wide
|
||||
will
|
||||
wind
|
||||
window
|
||||
wine
|
||||
wing
|
||||
winter
|
||||
wire
|
||||
wise
|
||||
with
|
||||
woman
|
||||
wood
|
||||
wool
|
||||
word
|
||||
work
|
||||
worm
|
||||
wound
|
||||
writing
|
||||
wrong
|
||||
year
|
||||
yellow
|
||||
yes
|
||||
yesterday
|
||||
you
|
||||
young
|
16
contrib/wordnet-blast/check/check.sh
Normal file
16
contrib/wordnet-blast/check/check.sh
Normal file
@ -0,0 +1,16 @@
|
||||
#!/bin/bash
#
# Compare the output of the wntest binary with the output of the `wn`
# command-line tool for every word listed in a file.
#
# Usage: check.sh WORD_LIST_FILE
#
# Writes WORD_LIST_FILE.blast (output of ./bin/wntest) and WORD_LIST_FILE.wn
# (output of `wn <word> -over` for each word), then prints a side-by-side
# colordiff of the two. ./bin/wntest is resolved relative to the current
# directory; `wn` and `colordiff` are looked up on PATH.

WNHOME=/usr/share/wordnet/

check() {
    # Abort with a usage message when no word list is given.
    local word_list="${1:?usage: check.sh WORD_LIST_FILE}"

    echo "./bin/wntest $WNHOME ${word_list}"
    time ./bin/wntest "$WNHOME" "${word_list}" > "${word_list}.blast"

    # \$i is escaped so the command is echoed literally; unescaped it expands
    # to the (still unset) loop variable and prints "wn  -over".
    echo "for i in \`cat ${word_list}\`; do wn \$i -over; done"
    # Word splitting of the file contents is intentional: one query per word.
    time for i in $(cat "${word_list}"); do wn "$i" -over; done > "${word_list}.wn"

    echo "diff ${word_list}.wn ${word_list}.blast -b"
    colordiff -y "${word_list}.wn" "${word_list}.blast" -b
}

check "$1"
|
7
contrib/wordnet-blast/check/list.txt
Normal file
7
contrib/wordnet-blast/check/list.txt
Normal file
@ -0,0 +1,7 @@
|
||||
cat
|
||||
lions
|
||||
city
|
||||
building
|
||||
salvation
|
||||
medications
|
||||
haven
|
72
contrib/wordnet-blast/wnb/bfs.hh
Normal file
72
contrib/wordnet-blast/wnb/bfs.hh
Normal file
@ -0,0 +1,72 @@
|
||||
#ifndef _BFS_HH
|
||||
# define _BFS_HH
|
||||
|
||||
# include <boost/graph/breadth_first_search.hpp>
|
||||
# include <boost/graph/filtered_graph.hpp>
|
||||
|
||||
namespace wnb
|
||||
{
|
||||
struct synset;
|
||||
|
||||
namespace bfs // breadth first search tools
|
||||
{
|
||||
/// bfs_visitor
|
||||
/// Sum distances and throw answer if target synset found
|
||||
template <typename DistanceMap>
|
||||
class distance_recorder : public boost::default_bfs_visitor
|
||||
{
|
||||
public:
|
||||
distance_recorder(DistanceMap dist, const synset& s, int max)
|
||||
: d(dist), target(s), max_length(max)
|
||||
{ }
|
||||
|
||||
template <typename Edge, typename Graph>
|
||||
void tree_edge(Edge e, const Graph& g) const
|
||||
{
|
||||
typename boost::graph_traits<Graph>::vertex_descriptor
|
||||
u = boost::source(e, g), v = boost::target(e, g);
|
||||
d[v] = d[u] + 1;
|
||||
|
||||
if (g[v] == target)
|
||||
throw d[v];
|
||||
if (d[v] > max_length)
|
||||
throw -1;
|
||||
}
|
||||
private:
|
||||
DistanceMap d;
|
||||
const synset& target;
|
||||
int max_length;
|
||||
};
|
||||
|
||||
/// Convenience function
|
||||
template <typename DistanceMap>
|
||||
distance_recorder<DistanceMap>
|
||||
record_distance(DistanceMap d, const synset& s, int m)
|
||||
{
|
||||
return distance_recorder<DistanceMap>(d, s, m);
|
||||
}
|
||||
|
||||
/// Edge predicate selecting which edges of the original graph show up in the
/// filtered graph: an edge is kept when its pointer symbol index is 1, 2, 3
/// or 4 (see the pointer symbol list of info_helper).
//FIXME: Do we really need a map here (check cost of property_map construction
// / should be light)
template <typename PointerSymbolMap>
struct hypo_hyper_edge {
  hypo_hyper_edge() { }

  hypo_hyper_edge(PointerSymbolMap pointer_symbol)
    : m_pointer_symbol(pointer_symbol) { }

  template <typename Edge>
  bool operator()(const Edge& e) const {
    const int symbol = get(m_pointer_symbol, e);
    // Indices 1..4 form one contiguous range in the symbol list.
    return 1 <= symbol && symbol <= 4;
  }

  PointerSymbolMap m_pointer_symbol;
};
|
||||
|
||||
} // end of wnb::bfs
|
||||
|
||||
} // end of namespace wnb
|
||||
|
||||
#endif /* _BFS_HH */
|
||||
|
148
contrib/wordnet-blast/wnb/core/info_helper.cc
Normal file
148
contrib/wordnet-blast/wnb/core/info_helper.cc
Normal file
@ -0,0 +1,148 @@
|
||||
#include "info_helper.hh"
|
||||
|
||||
#include <iostream>
|
||||
#include <fstream>
|
||||
#include <sstream>
|
||||
#include <map>
|
||||
|
||||
#include <cassert>
|
||||
|
||||
namespace wnb
|
||||
{
|
||||
|
||||
// Class info_helper
|
||||
|
||||
/// List of pointer symbols
|
||||
const char *
|
||||
info_helper::symbols[info_helper::NB_SYMBOLS] = {
|
||||
"!" , // 0 Antonym
|
||||
"@" , // 1 Hypernym
|
||||
"@i", // 2 Instance Hypernym
|
||||
"~" , // 3 Hyponym
|
||||
"~i", // 4 Instance Hyponym
|
||||
"#m", // 5 Member holonym
|
||||
"#s", // 6 Substance holonym
|
||||
"#p", // 7 Part holonym
|
||||
"%m", // 8 Member meronym
|
||||
"%s", // 9 Substance meronym
|
||||
"%p", // 10 Part meronym
|
||||
"=" , // 11 Attribute
|
||||
"+" , // 12 Derivationally related form
|
||||
";c", // 13 Domain of synset - TOPIC
|
||||
"-c", // 14 Member of this domain - TOPIC
|
||||
";r", // 15 Domain of synset - REGION
|
||||
"-r", // 16 Member of this domain - REGION
|
||||
";u", // 17 Domain of synset - USAGE
|
||||
"-u", // 18 Member of this domain - USAGE
|
||||
|
||||
//The pointer_symbol s for verbs are:
|
||||
"*", // 19 Entailment
|
||||
">", // 20 Cause
|
||||
"^", // 21 Also see
|
||||
"$", // 22 Verb Group
|
||||
|
||||
//The pointer_symbol s for adjectives are:
|
||||
"&", // 23 Similar to
|
||||
"<", // 24 Participle of verb
|
||||
"\\", // 25 Pertainym (pertains to noun)
|
||||
"=", // 26 Attribute
|
||||
};
|
||||
|
||||
const std::string info_helper::sufx[] = {
|
||||
/* Noun suffixes */
|
||||
"s", "ses", "xes", "zes", "ches", "shes", "men", "ies",
|
||||
/* Verb suffixes */
|
||||
"s", "ies", "es", "es", "ed", "ed", "ing", "ing",
|
||||
/* Adjective suffixes */
|
||||
"er", "est", "er", "est"
|
||||
};
|
||||
|
||||
const std::string info_helper::addr[] = {
|
||||
/* Noun endings */
|
||||
"", "s", "x", "z", "ch", "sh", "man", "y",
|
||||
/* Verb endings */
|
||||
"", "y", "e", "", "e", "", "e", "",
|
||||
/* Adjective endings */
|
||||
"", "", "e", "e"
|
||||
};
|
||||
|
||||
const int info_helper::offsets[info_helper::NUMPARTS] = { 0, 0, 8, 16, 0, 0 };
|
||||
const int info_helper::cnts[info_helper::NUMPARTS] = { 0, 8, 8, 4, 0, 0 };
|
||||
|
||||
void
|
||||
info_helper::update_pos_maps()
|
||||
{
|
||||
// http://wordnet.princeton.edu/wordnet/man/wndb.5WN.html#sect3
|
||||
|
||||
indice_offset[UNKNOWN] = 0;
|
||||
|
||||
indice_offset[N] = 0;
|
||||
indice_offset[V] = indice_offset[N] + pos_maps[N].size();
|
||||
indice_offset[A] = indice_offset[V] + pos_maps[V].size();
|
||||
indice_offset[R] = indice_offset[A] + pos_maps[A].size();
|
||||
indice_offset[S] = indice_offset[R] + pos_maps[R].size();
|
||||
|
||||
}
|
||||
|
||||
int info_helper::compute_indice(int offset, pos_t pos)
|
||||
{
|
||||
if (pos == S)
|
||||
pos = A;
|
||||
std::map<int,int>& map = pos_maps[pos];
|
||||
|
||||
assert(pos <= 5 && pos > 0);
|
||||
|
||||
return indice_offset[pos] + map[offset];
|
||||
}
|
||||
|
||||
// Function definitions
|
||||
|
||||
// Build the offset -> index table of one data file.
//
// The first 29 lines of the file are discarded as header. Every following
// line contributes one entry: the integer read from the start of the line is
// the key, the zero-based position of the line after the header is the
// value. Throws std::runtime_error when the file cannot be opened.
static
std::map<int,int>
preprocess_data(const std::string& fn)
{
  std::ifstream file(fn.c_str());
  if (!file.is_open())
    throw std::runtime_error("preprocess_data: File not found: " + fn);

  std::string row;

  //skip header
  const unsigned int header_nb_lines = 29;
  for (unsigned int skipped = 0; skipped != header_nb_lines; ++skipped)
    std::getline(file, row);

  //parse data lines
  std::map<int,int> index_of_offset;
  for (int ind = 0; std::getline(file, row); ++ind)
  {
    std::stringstream srow(row);
    int offset;
    srow >> offset;
    // insert() keeps the first index seen if an offset repeats
    index_of_offset.insert(std::map<int,int>::value_type(offset, ind));
  }

  // the stream is closed by its destructor
  return index_of_offset;
}
|
||||
|
||||
/// Build an info_helper from the four data files found under `dn`.
///
/// `dn` is used as a plain string prefix of the file names, so it has to end
/// with a path separator. The per-pos start indices are refreshed once all
/// tables are loaded.
info_helper
preprocess_wordnet(const std::string& dn)
{
  static const struct { pos_t pos; const char * file; } sources[] = {
    { N, "data.noun" }, // noun_map
    { V, "data.verb" }, // verb_map
    { A, "data.adj"  }, // adj_map
    { R, "data.adv"  }, // adv_map
  };

  info_helper info;

  for (std::size_t i = 0; i < sizeof(sources) / sizeof(sources[0]); ++i)
    info.pos_maps[sources[i].pos] = preprocess_data(dn + sources[i].file);

  info.update_pos_maps();

  return info;
}
|
||||
|
||||
} // end of namespace wnb
|
||||
|
85
contrib/wordnet-blast/wnb/core/info_helper.hh
Normal file
85
contrib/wordnet-blast/wnb/core/info_helper.hh
Normal file
@ -0,0 +1,85 @@
|
||||
#pragma once
|
||||
|
||||
# include <string>
|
||||
# include <stdexcept>
|
||||
# include <map>
|
||||
|
||||
# include "pos_t.hh"
|
||||
|
||||
namespace wnb
|
||||
{
|
||||
|
||||
/// Useful information for wordnet in-memory import
|
||||
struct info_helper
|
||||
{
|
||||
/// Symbols' size
|
||||
static const std::size_t NB_SYMBOLS = 27;
|
||||
static const std::size_t NUMPARTS = POS_ARRAY_SIZE;
|
||||
|
||||
/// List of pointer symbols
|
||||
static const char * symbols[NB_SYMBOLS];
|
||||
static const std::string sufx[];
|
||||
static const std::string addr[];
|
||||
|
||||
static const int offsets[NUMPARTS];
|
||||
static const int cnts[NUMPARTS];
|
||||
|
||||
typedef std::map<int,int> i2of_t; ///< indice/offset correspondences
|
||||
typedef std::map<pos_t, i2of_t> pos_i2of_t; ///< pos / map correspondences
|
||||
|
||||
/// Constructor
|
||||
info_helper() { update_pos_maps(); }
|
||||
|
||||
/// Compute the number of synsets (i.e. the number of vertex in the graph)
|
||||
unsigned nb_synsets()
|
||||
{
|
||||
typedef pos_i2of_t::iterator iter_t;
|
||||
|
||||
int sum = 0;
|
||||
for (iter_t it = pos_maps.begin(); it != pos_maps.end(); it++)
|
||||
sum += (*it).second.size();
|
||||
|
||||
return sum;
|
||||
//return adj_map.size() + adv_map.size() + noun_map.size() + verb_map.size();
|
||||
}
|
||||
|
||||
// Given a pos return the starting indice in the graph
|
||||
int get_indice_offset(pos_t pos)
|
||||
{
|
||||
return indice_offset[pos];
|
||||
}
|
||||
|
||||
/// Helper function computing global indice in graph from local offset
|
||||
int compute_indice(int offset, pos_t pos);
|
||||
|
||||
/// Update a map allowing one to get the correct map given a pos
|
||||
void update_pos_maps();
|
||||
|
||||
int get_symbol(const std::string& ps)
|
||||
{
|
||||
for (unsigned i = 0; i < NB_SYMBOLS; i++)
|
||||
if (ps == symbols[i])
|
||||
return i;
|
||||
throw std::runtime_error("Symbol NOT FOUND.");
|
||||
}
|
||||
|
||||
pos_t get_pos(const char& c)
|
||||
{
|
||||
return get_pos_from_char(c);
|
||||
}
|
||||
|
||||
public:
|
||||
|
||||
// i2of_t adj_map;
|
||||
// i2of_t adv_map;
|
||||
// i2of_t noun_map;
|
||||
// i2of_t verb_map;
|
||||
|
||||
pos_i2of_t pos_maps;
|
||||
std::size_t indice_offset[POS_ARRAY_SIZE];
|
||||
};
|
||||
|
||||
/// Create a new info_help based on wordnet data located in dn (../dict/)
|
||||
info_helper preprocess_wordnet(const std::string& dn);
|
||||
|
||||
} // end of namespace wnb
|
381
contrib/wordnet-blast/wnb/core/load_wordnet.cc
Normal file
381
contrib/wordnet-blast/wnb/core/load_wordnet.cc
Normal file
@ -0,0 +1,381 @@
|
||||
#include "load_wordnet.hh"
|
||||
|
||||
#include <fstream>
|
||||
#include <iostream>
|
||||
#include <sstream>
|
||||
#include <stdexcept>
|
||||
#include <algorithm>
|
||||
#include <utility>
|
||||
|
||||
#include <boost/graph/adjacency_list.hpp>
|
||||
#include <boost/progress.hpp>
|
||||
#include <boost/algorithm/string.hpp>
|
||||
|
||||
#include <wnb/std_ext.hh>
|
||||
|
||||
#include "wordnet.hh"
|
||||
#include "info_helper.hh"
|
||||
#include "pos_t.hh"
|
||||
|
||||
namespace bg = boost::graph;
|
||||
|
||||
namespace wnb
|
||||
{
|
||||
|
||||
namespace
|
||||
{
|
||||
|
||||
// Load synset's words
|
||||
void load_data_row_words(std::stringstream& srow, synset& synset)
|
||||
{
|
||||
srow >> std::hex >> synset.w_cnt >> std::dec;
|
||||
for (std::size_t i = 0; i < synset.w_cnt; i++)
|
||||
{
|
||||
//word lex_id
|
||||
|
||||
std::string word;
|
||||
srow >> word;
|
||||
synset.words.push_back(word);
|
||||
|
||||
int lex_id;
|
||||
srow >> std::hex >> lex_id >> std::dec;
|
||||
synset.lex_ids.push_back(lex_id);
|
||||
}
|
||||
}
|
||||
|
||||
// Add rel to graph
|
||||
void add_wordnet_rel(std::string& pointer_symbol_,// type of relation
|
||||
int synset_offset, // dest offset
|
||||
pos_t pos, // p.o.s. of dest
|
||||
int src, // word src
|
||||
int trgt, // word target
|
||||
synset& synset, // source synset
|
||||
wordnet& wn, // our wordnet
|
||||
info_helper& info) // helper
|
||||
{
|
||||
//if (pos == S || synset.pos == S)
|
||||
// return; //FIXME: check where are s synsets.
|
||||
|
||||
int u = synset.id;
|
||||
int v = info.compute_indice(synset_offset, pos);
|
||||
|
||||
ptr p;
|
||||
p.pointer_symbol = info.get_symbol(pointer_symbol_);
|
||||
p.source = src;
|
||||
p.target = trgt;
|
||||
|
||||
boost::add_edge(u,v, p, wn.wordnet_graph);
|
||||
}
|
||||
|
||||
|
||||
// load ptrs
|
||||
void load_data_row_ptrs(std::stringstream& srow, synset& synset,
|
||||
wordnet& wn, info_helper& info)
|
||||
{
|
||||
srow >> synset.p_cnt;
|
||||
for (std::size_t i = 0; i < synset.p_cnt; i++)
|
||||
{
|
||||
//http://wordnet.princeton.edu/wordnet/man/wndb.5WN.html#sect3
|
||||
//pointer_symbol synset_offset pos source/target
|
||||
std::string pointer_symbol_;
|
||||
int synset_offset;
|
||||
pos_t pos;
|
||||
int src;
|
||||
int trgt;
|
||||
|
||||
srow >> pointer_symbol_;
|
||||
srow >> synset_offset;
|
||||
|
||||
char c;
|
||||
srow >> c;
|
||||
pos = info.get_pos(c);
|
||||
|
||||
//print extracted edges
|
||||
//std::cout << "(" << pointer_symbol << ", " << synset_offset;
|
||||
//std::cout << ", " << pos << ")" << std::endl;
|
||||
|
||||
// Extract source/target words info
|
||||
std::string src_trgt;
|
||||
srow >> src_trgt;
|
||||
std::stringstream ssrc(std::string(src_trgt,0,2));
|
||||
std::stringstream strgt(std::string(src_trgt,2,2));
|
||||
ssrc >> std::hex >> src >> std::dec;
|
||||
strgt >> std::hex >> trgt >> std::dec;
|
||||
|
||||
add_wordnet_rel(pointer_symbol_, synset_offset, pos, src, trgt, synset, wn, info);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// Load a synset and add it to the wordnet class.
|
||||
void load_data_row(const std::string& row, wordnet& wn, info_helper& info)
|
||||
{
|
||||
//http://wordnet.princeton.edu/wordnet/man/wndb.5WN.html#sect3
|
||||
// synset_offset lex_filenum ss_type w_cnt word lex_id [word lex_id...] p_cnt [ptr...] [frames...] | gloss
|
||||
synset synset;
|
||||
|
||||
std::stringstream srow(row);
|
||||
int synset_offset;
|
||||
srow >> synset_offset;
|
||||
srow >> synset.lex_filenum;
|
||||
char ss_type;
|
||||
srow >> ss_type;
|
||||
|
||||
// extra information
|
||||
synset.pos = info.get_pos(ss_type);
|
||||
synset.id = info.compute_indice(synset_offset, synset.pos);
|
||||
|
||||
// words
|
||||
load_data_row_words(srow, synset);
|
||||
|
||||
// ptrs
|
||||
load_data_row_ptrs(srow, synset, wn, info);
|
||||
|
||||
//frames (skipped)
|
||||
std::string tmp;
|
||||
while (srow >> tmp)
|
||||
if (tmp == "|")
|
||||
break;
|
||||
|
||||
// gloss
|
||||
std::getline(srow, synset.gloss);
|
||||
|
||||
// extra
|
||||
synset.sense_number = 0;
|
||||
|
||||
// Add synset to graph
|
||||
wn.wordnet_graph[synset.id] = synset;
|
||||
}
|
||||
|
||||
|
||||
// Parse data.noun files
|
||||
void load_wordnet_data(const std::string& fn, wordnet& wn, info_helper& info)
|
||||
{
|
||||
std::ifstream fin(fn.c_str());
|
||||
if (!fin.is_open())
|
||||
throw std::runtime_error("File missing: " + fn);
|
||||
|
||||
static const int MAX_LENGTH = 20480;
|
||||
char row[MAX_LENGTH];
|
||||
|
||||
//skip header
|
||||
for(unsigned i = 0; i < 29; i++)
|
||||
fin.getline(row, MAX_LENGTH);
|
||||
|
||||
//parse data line
|
||||
while (fin.getline(row, MAX_LENGTH))
|
||||
load_data_row(row, wn, info);
|
||||
|
||||
fin.close();
|
||||
}
|
||||
|
||||
|
||||
//FIXME: It seems possible to replace synset_offsets with indice here.
|
||||
void load_index_row(const std::string& row, wordnet& wn, info_helper& info)
|
||||
{
|
||||
// lemma pos synset_cnt p_cnt [ptr_symbol...] sense_cnt tagsense_cnt synset_offset [synset_offset...]
|
||||
index index;
|
||||
std::stringstream srow(row);
|
||||
|
||||
char pos;
|
||||
srow >> index.lemma;
|
||||
srow >> pos;
|
||||
index.pos = info.get_pos(pos); // extra data
|
||||
srow >> index.synset_cnt;
|
||||
srow >> index.p_cnt;
|
||||
|
||||
std::string tmp_p;
|
||||
for (std::size_t i = 0; i < index.p_cnt; i++)
|
||||
{
|
||||
srow >> tmp_p;
|
||||
index.ptr_symbols.push_back(tmp_p);
|
||||
}
|
||||
srow >> index.sense_cnt;
|
||||
srow >> index.tagsense_cnt;
|
||||
|
||||
int tmp_o;
|
||||
while (srow >> tmp_o)
|
||||
{
|
||||
index.synset_offsets.push_back(tmp_o);
|
||||
index.synset_ids.push_back(info.compute_indice(tmp_o, index.pos)); // extra data
|
||||
}
|
||||
|
||||
//add synset to index list
|
||||
wn.index_list.push_back(index);
|
||||
}
|
||||
|
||||
|
||||
void load_wordnet_index(const std::string& fn, wordnet& wn, info_helper& info)
|
||||
{
|
||||
std::ifstream fin(fn.c_str());
|
||||
if (!fin.is_open())
|
||||
throw std::runtime_error("File Not Found: " + fn);
|
||||
|
||||
static const int MAX_LENGTH = 20480;
|
||||
char row[MAX_LENGTH];
|
||||
|
||||
//skip header
|
||||
const unsigned int header_nb_lines = 29;
|
||||
for(std::size_t i = 0; i < header_nb_lines; i++)
|
||||
fin.getline(row, MAX_LENGTH);
|
||||
|
||||
//parse data line
|
||||
while (fin.getline(row, MAX_LENGTH))
|
||||
load_index_row(row, wn, info);
|
||||
|
||||
fin.close();
|
||||
}
|
||||
|
||||
|
||||
void load_wordnet_exc(const std::string& dn, std::string cat,
|
||||
wordnet& wn, info_helper&)
|
||||
{
|
||||
std::string fn = dn + cat + ".exc";
|
||||
std::ifstream fin(fn.c_str());
|
||||
if (!fin.is_open())
|
||||
throw std::runtime_error("File Not Found: " + fn);
|
||||
|
||||
std::map<std::string,std::string>& exc = wn.exc[get_pos_from_name(cat)];
|
||||
|
||||
std::string row;
|
||||
|
||||
std::string key, value;
|
||||
while (std::getline(fin, row))
|
||||
{
|
||||
std::stringstream srow(row);
|
||||
srow >> key;
|
||||
srow >> value;
|
||||
|
||||
exc[key] = value;
|
||||
}
|
||||
}
|
||||
|
||||
void load_wordnet_cat(const std::string dn, std::string cat,
|
||||
wordnet& wn, info_helper& info)
|
||||
{
|
||||
load_wordnet_data((dn + "data." + cat), wn, info);
|
||||
load_wordnet_index((dn + "index." + cat), wn, info);
|
||||
load_wordnet_exc(dn, cat, wn, info);
|
||||
}
|
||||
|
||||
// FIXME: this file is not in all packaged version of wordnet
|
||||
void load_wordnet_index_sense(const std::string& dn, wordnet& wn, info_helper& info)
|
||||
{
|
||||
std::string fn = dn + "index.sense";
|
||||
std::ifstream fin(fn.c_str());
|
||||
if (!fin.is_open())
|
||||
throw std::runtime_error("File Not Found: " + fn);
|
||||
|
||||
std::string row;
|
||||
std::string sense_key;
|
||||
int synset_offset;
|
||||
while (std::getline(fin, row))
|
||||
{
|
||||
std::stringstream srow(row);
|
||||
srow >> sense_key;
|
||||
|
||||
// Get the pos of the lemma
|
||||
std::vector<std::string> sk = ext::split(sense_key,'%');
|
||||
std::string word = sk.at(0);
|
||||
std::stringstream tmp(ext::split(sk.at(1), ':').at(0));
|
||||
int ss_type;
|
||||
tmp >> ss_type;
|
||||
pos_t pos = (pos_t) ss_type;
|
||||
|
||||
srow >> synset_offset;
|
||||
|
||||
// Update synset info
|
||||
int u = info.compute_indice(synset_offset, pos);
|
||||
int sense_number;
|
||||
srow >> sense_number;
|
||||
wn.wordnet_graph[u].sense_number += sense_number;
|
||||
int tag_cnt;
|
||||
srow >> tag_cnt;
|
||||
if (tag_cnt != 0)
|
||||
wn.wordnet_graph[u].tag_cnts.push_back( make_pair(word,tag_cnt) );
|
||||
|
||||
//if (synset_offset == 2121620)
|
||||
// std::cout << u << " " << word << " " << synset_offset << " "
|
||||
// << wn.wordnet_graph[u].tag_cnt << " "
|
||||
// << wn.wordnet_graph[u].words[0] << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
// wn -over used info in cntlist even if this is deprecated
|
||||
// It is ok not to FIX and use this function
|
||||
void load_wordnet_cntlist(const std::string& dn, wordnet& wn, info_helper& info)
|
||||
{
|
||||
std::string fn = dn + "cntlist";
|
||||
std::ifstream fin(fn.c_str());
|
||||
if (!fin.is_open())
|
||||
throw std::runtime_error("File Not Found: " + fn);
|
||||
|
||||
std::string sense_key;
|
||||
int sense_number;
|
||||
int tag_cnt;
|
||||
|
||||
std::string row;
|
||||
while (std::getline(fin, row))
|
||||
{
|
||||
std::stringstream srow(row);
|
||||
|
||||
srow >> sense_key;
|
||||
srow >> sense_number;
|
||||
srow >> tag_cnt;
|
||||
|
||||
// Get the pos of the lemma
|
||||
std::string word = ext::split(sense_key,'%').at(0);
|
||||
std::stringstream tmp(ext::split(ext::split(sense_key,'%').at(1), ':').at(0));
|
||||
int ss_type;
|
||||
tmp >> ss_type;
|
||||
pos_t pos = (pos_t) ss_type;
|
||||
|
||||
// Update synset info
|
||||
int synset_offset; // FIXME
|
||||
int u = info.compute_indice(synset_offset, pos);
|
||||
wn.wordnet_graph[u].sense_number += sense_number;
|
||||
if (tag_cnt != 0)
|
||||
wn.wordnet_graph[u].tag_cnts.push_back( make_pair(word,tag_cnt) );
|
||||
}
|
||||
}
|
||||
|
||||
} // end of anonymous namespace
|
||||
|
||||
void load_wordnet(const std::string& dn, wordnet& wn, info_helper& info)
|
||||
{
|
||||
// vertex added in this order a n r v
|
||||
|
||||
std::string fn = dn;
|
||||
|
||||
if (wn._verbose)
|
||||
{
|
||||
std::cout << std::endl;
|
||||
std::cout << "### Loading Wordnet 3.0";
|
||||
boost::progress_display show_progress(5);
|
||||
boost::progress_timer t;
|
||||
|
||||
load_wordnet_cat(dn, "adj", wn, info);
|
||||
++show_progress;
|
||||
load_wordnet_cat(dn, "noun", wn, info);
|
||||
++show_progress;
|
||||
load_wordnet_cat(dn, "adv", wn, info);
|
||||
++show_progress;
|
||||
load_wordnet_cat(dn, "verb", wn, info);
|
||||
++show_progress;
|
||||
load_wordnet_index_sense(dn, wn, info);
|
||||
++show_progress;
|
||||
std::cout << std::endl;
|
||||
}
|
||||
else
|
||||
{
|
||||
load_wordnet_cat(dn, "adj", wn, info);
|
||||
load_wordnet_cat(dn, "noun", wn, info);
|
||||
load_wordnet_cat(dn, "adv", wn, info);
|
||||
load_wordnet_cat(dn, "verb", wn, info);
|
||||
load_wordnet_index_sense(dn, wn, info);
|
||||
}
|
||||
|
||||
std::stable_sort(wn.index_list.begin(), wn.index_list.end());
|
||||
}
|
||||
|
||||
} // end of namespace wnb
|
12
contrib/wordnet-blast/wnb/core/load_wordnet.hh
Normal file
12
contrib/wordnet-blast/wnb/core/load_wordnet.hh
Normal file
@ -0,0 +1,12 @@
|
||||
#pragma once
|
||||
|
||||
# include "info_helper.hh"
|
||||
|
||||
namespace wnb
|
||||
{
|
||||
/// forward declaration
|
||||
struct wordnet;
|
||||
|
||||
/// Load the entire wordnet data base located in \p dn (typically .../dict/)
|
||||
void load_wordnet(const std::string& dn, wordnet& wn, info_helper& info);
|
||||
}
|
61
contrib/wordnet-blast/wnb/core/pos_t.hh
Normal file
61
contrib/wordnet-blast/wnb/core/pos_t.hh
Normal file
@ -0,0 +1,61 @@
|
||||
#pragma once
|
||||
|
||||
namespace wnb
{

  /// Number of part-of-speech codes, UNKNOWN included.
  static const std::size_t POS_ARRAY_SIZE = 6;
  /// One-letter code of each part of speech, indexed by pos_t value.
  static const char POS_ARRAY[POS_ARRAY_SIZE] = {'u', 'n', 'v', 'a', 'r', 's'};

  /// Part of speech.
  enum pos_t
  {
    UNKNOWN = 0, ///< no / unrecognised part of speech
    N       = 1, ///< "noun"
    V       = 2, ///< "verb"
    A       = 3, ///< "adj"
    R       = 4, ///< "adv"
    S       = 5  ///< "adj sat"
  };

  /// Map a category name ("adj", "noun", "adv", "verb", "adj sat") to its
  /// pos_t; any other name yields UNKNOWN.
  inline pos_t get_pos_from_name(const std::string& pos)
  {
    struct name_entry
    {
      const char* name;
      pos_t value;
    };
    static const name_entry table[] = {
      {"adj", A}, {"noun", N}, {"adv", R}, {"verb", V}, {"adj sat", S}
    };

    for (std::size_t i = 0; i < sizeof(table) / sizeof(table[0]); ++i)
      if (pos == table[i].name)
        return table[i].value;
    return UNKNOWN;
  }

  /// Inverse of get_pos_from_name(); values without a name yield "UNKNOWN".
  inline std::string get_name_from_pos(const pos_t& pos)
  {
    if (pos == A)
      return "adj";
    if (pos == N)
      return "noun";
    if (pos == R)
      return "adv";
    if (pos == V)
      return "verb";
    if (pos == S)
      return "adj sat";
    return "UNKNOWN";
  }

  /// Map a one-letter code ('n', 'v', 'a', 'r', 's') to its pos_t; any other
  /// character yields UNKNOWN.
  inline pos_t get_pos_from_char(const char& c)
  {
    // POS_ARRAY[i] is the code of the pos_t whose value is i; slot 0 is the
    // UNKNOWN placeholder, so the search starts at 1.
    for (std::size_t i = 1; i < POS_ARRAY_SIZE; ++i)
      if (POS_ARRAY[i] == c)
        return static_cast<pos_t>(i);
    return UNKNOWN;
  }

} // end of namespace wnb
|
186
contrib/wordnet-blast/wnb/core/wordnet.cc
Normal file
186
contrib/wordnet-blast/wnb/core/wordnet.cc
Normal file
@ -0,0 +1,186 @@
|
||||
#include <wnb/core/wordnet.hh>
|
||||
#include <wnb/std_ext.hh>
|
||||
|
||||
#include <string>
|
||||
#include <set>
|
||||
#include <algorithm>
|
||||
#include <stdexcept>
|
||||
#include <boost/graph/breadth_first_search.hpp>
|
||||
#include <boost/graph/filtered_graph.hpp>
|
||||
|
||||
namespace wnb
|
||||
{
|
||||
|
||||
//FIXME: Make (smart) use of fs::path
|
||||
wordnet::wordnet(const std::string& wordnet_dir, bool verbose)
|
||||
: _verbose(verbose)
|
||||
{
|
||||
if (_verbose)
|
||||
{
|
||||
std::cout << wordnet_dir << std::endl;
|
||||
}
|
||||
|
||||
info = preprocess_wordnet(wordnet_dir);
|
||||
|
||||
wordnet_graph = graph(info.nb_synsets());
|
||||
load_wordnet(wordnet_dir, *this, info);
|
||||
|
||||
if (_verbose)
|
||||
{
|
||||
std::cout << "nb_synsets: " << info.nb_synsets() << std::endl;
|
||||
}
|
||||
//FIXME: this check is only valid for Wordnet 3.0
|
||||
//assert(info.nb_synsets() == 142335);//117659);
|
||||
assert(info.nb_synsets() > 0);
|
||||
}
|
||||
|
||||
std::vector<synset>
|
||||
wordnet::get_synsets(const std::string& word, pos_t pos)
|
||||
{
|
||||
std::vector<synset> synsets;
|
||||
|
||||
// morphing
|
||||
std::string mword = morphword(word, pos);
|
||||
if (mword == "")
|
||||
return synsets;
|
||||
|
||||
// binary_search
|
||||
typedef std::vector<index> vi;
|
||||
std::pair<vi::iterator,vi::iterator> bounds = get_indexes(mword);
|
||||
|
||||
vi::iterator it;
|
||||
for (it = bounds.first; it != bounds.second; it++)
|
||||
{
|
||||
if (pos == pos_t::UNKNOWN || it->pos == pos)
|
||||
{
|
||||
for (std::size_t i = 0; i < it->synset_ids.size(); i++)
|
||||
{
|
||||
int id = it->synset_ids[i];
|
||||
synsets.push_back(wordnet_graph[id]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return synsets;
|
||||
}
|
||||
|
||||
const std::vector<std::string> *
|
||||
wordnet::get_synset(const std::string& word, pos_t pos) const {
|
||||
|
||||
typedef std::vector<index> vi;
|
||||
std::pair<vi::const_iterator,vi::const_iterator> bounds = get_indexes_const(word);
|
||||
|
||||
for (vi::const_iterator it = bounds.first; it != bounds.second; it++)
|
||||
{
|
||||
if (pos == pos_t::UNKNOWN || it->pos == pos)
|
||||
{
|
||||
int id = it->synset_ids[0];
|
||||
return &wordnet_graph[id].words;
|
||||
}
|
||||
}
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
std::pair<std::vector<index>::const_iterator, std::vector<index>::const_iterator>
|
||||
wordnet::get_indexes_const(const std::string& word) const
|
||||
{
|
||||
index light_index;
|
||||
light_index.lemma = word;
|
||||
|
||||
typedef std::vector<index> vi;
|
||||
std::pair<vi::const_iterator,vi::const_iterator> bounds =
|
||||
std::equal_range(index_list.begin(), index_list.end(), light_index);
|
||||
|
||||
return bounds;
|
||||
}
|
||||
|
||||
std::pair<std::vector<index>::iterator, std::vector<index>::iterator>
|
||||
wordnet::get_indexes(const std::string& word)
|
||||
{
|
||||
index light_index;
|
||||
light_index.lemma = word;
|
||||
|
||||
typedef std::vector<index> vi;
|
||||
std::pair<vi::iterator,vi::iterator> bounds =
|
||||
std::equal_range(index_list.begin(), index_list.end(), light_index);
|
||||
|
||||
return bounds;
|
||||
}
|
||||
|
||||
std::string
|
||||
wordnet::wordbase(const std::string& word, int ender)
|
||||
{
|
||||
if (ext::ends_with(word, info.sufx[ender]))
|
||||
{
|
||||
int sufxlen = info.sufx[ender].size();
|
||||
std::string strOut = word.substr(0, word.size() - sufxlen);
|
||||
if (!info.addr[ender].empty())
|
||||
strOut += info.addr[ender];
|
||||
return strOut;
|
||||
}
|
||||
return word;
|
||||
}
|
||||
|
||||
bool is_defined(const std::string& word, pos_t pos)
|
||||
{
|
||||
// hack FIXME: Some verbs are built with -e suffix ('builde' is just an example).
|
||||
if (pos == V && word == "builde")
|
||||
return false;
|
||||
return true;
|
||||
}
|
||||
|
||||
// Try to find baseform (lemma) of individual word in POS
|
||||
std::string
|
||||
wordnet::morphword(const std::string& word, pos_t pos)
|
||||
{
|
||||
// first look for word on exception list
|
||||
exc_t::iterator it = exc[pos].find(word);
|
||||
if (it != exc[pos].end())
|
||||
return it->second; // found in exception list
|
||||
|
||||
std::string tmpbuf;
|
||||
std::string end;
|
||||
int cnt = 0;
|
||||
|
||||
if (pos == R)
|
||||
return ""; // Only use exception list for adverbs
|
||||
|
||||
if (pos == N)
|
||||
{
|
||||
if (ext::ends_with(word, "ful"))
|
||||
{
|
||||
cnt = word.size() - 3;
|
||||
tmpbuf = word.substr(0, cnt);
|
||||
end = "ful";
|
||||
}
|
||||
else
|
||||
{
|
||||
// check for noun ending with 'ss' or short words
|
||||
if (ext::ends_with(word, "ss") || word.size() <= 2)
|
||||
return "";
|
||||
}
|
||||
}
|
||||
|
||||
// If not in exception list, try applying rules from tables
|
||||
|
||||
if (tmpbuf.size() == 0)
|
||||
tmpbuf = word;
|
||||
|
||||
if (pos != pos_t::UNKNOWN)
|
||||
{
|
||||
int offset = info.offsets[pos];
|
||||
int pos_cnt = info.cnts[pos];
|
||||
|
||||
std::string morphed;
|
||||
for (int i = 0; i < pos_cnt; i++)
|
||||
{
|
||||
morphed = wordbase(tmpbuf, (i + offset));
|
||||
if (morphed != tmpbuf && is_defined(morphed, pos))
|
||||
return morphed + end;
|
||||
}
|
||||
return morphed;
|
||||
}
|
||||
return word;
|
||||
}
|
||||
|
||||
} // end of namespace wnb
|
113
contrib/wordnet-blast/wnb/core/wordnet.hh
Normal file
113
contrib/wordnet-blast/wnb/core/wordnet.hh
Normal file
@ -0,0 +1,113 @@
|
||||
#pragma once
|
||||
|
||||
# include <iostream>
|
||||
# include <string>
|
||||
# include <cassert>
|
||||
# include <vector>
|
||||
//# include <boost/filesystem.hpp>
|
||||
|
||||
//Possible https://bugs.launchpad.net/ubuntu/+source/boost/+bug/270873
|
||||
# include <boost/graph/graph_traits.hpp>
|
||||
# include <boost/graph/adjacency_list.hpp>
|
||||
|
||||
# include "load_wordnet.hh"
|
||||
# include "pos_t.hh"
|
||||
|
||||
namespace wnb
|
||||
{
|
||||
|
||||
/// More info here: http://wordnet.princeton.edu/wordnet/man/wndb.5WN.html
|
||||
|
||||
struct info_helper;
|
||||
|
||||
/// Synset
|
||||
struct synset
|
||||
{
|
||||
int lex_filenum;
|
||||
std::size_t w_cnt;
|
||||
std::vector<std::string> words;
|
||||
std::vector<int> lex_ids;
|
||||
std::size_t p_cnt;
|
||||
std::string gloss;
|
||||
|
||||
// extra
|
||||
pos_t pos; ///< pos (replace ss_type)
|
||||
int id; ///< unique identifier (replace synset_offset)
|
||||
int sense_number; ///< http://wordnet.princeton.edu/man/senseidx.5WN.html
|
||||
std::vector<std::pair<std::string, int> > tag_cnts; ///< http://wordnet.princeton.edu/man/senseidx.5WN.html
|
||||
|
||||
bool operator==(const synset& s) const { return (id == s.id); }
|
||||
bool operator<(const synset& s) const { return (id < s.id); }
|
||||
};
|
||||
|
||||
|
||||
/// Properties of a relation between two synsets (edge property of the graph).
struct ptr
{
  /// Symbol of the relation, stored as an integer code
  /// (it used to be a std::string).
  int pointer_symbol;
  /// Source word inside synset.
  int source;
  /// Target word inside synset.
  int target;
};
|
||||
|
||||
|
||||
/// Index
|
||||
struct index
|
||||
{
|
||||
std::string lemma;
|
||||
|
||||
std::size_t synset_cnt;
|
||||
std::size_t p_cnt;
|
||||
std::size_t sense_cnt;
|
||||
float tagsense_cnt;
|
||||
std::vector<std::string> ptr_symbols;
|
||||
std::vector<int> synset_offsets;
|
||||
|
||||
// extra
|
||||
std::vector<int> synset_ids;
|
||||
pos_t pos;
|
||||
|
||||
bool operator<(const index& b) const
|
||||
{
|
||||
return (lemma.compare(b.lemma) < 0);
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
/// Wordnet interface class
|
||||
struct wordnet
|
||||
{
|
||||
typedef boost::adjacency_list<boost::vecS, boost::vecS,
|
||||
boost::directedS,
|
||||
synset, ptr> graph; ///< boost graph type
|
||||
|
||||
/// Constructor
|
||||
wordnet(const std::string& wordnet_dir, bool verbose=false);
|
||||
|
||||
/// Return synsets matching word
|
||||
std::vector<synset> get_synsets(const std::string& word, pos_t pos = pos_t::UNKNOWN);
|
||||
//FIXME: todo
|
||||
std::vector<synset> get_synset(const std::string& word, char pos, int i);
|
||||
// added
|
||||
const std::vector<std::string> * get_synset(const std::string& word, pos_t pos = pos_t::UNKNOWN) const;
|
||||
|
||||
std::pair<std::vector<index>::iterator, std::vector<index>::iterator>
|
||||
get_indexes(const std::string& word);
|
||||
|
||||
std::pair<std::vector<index>::const_iterator, std::vector<index>::const_iterator>
|
||||
get_indexes_const(const std::string& word) const;
|
||||
|
||||
std::string wordbase(const std::string& word, int ender);
|
||||
|
||||
std::string morphword(const std::string& word, pos_t pos);
|
||||
|
||||
std::vector<index> index_list; ///< index list // FIXME: use a map
|
||||
graph wordnet_graph; ///< synsets graph
|
||||
info_helper info; ///< helper object
|
||||
bool _verbose;
|
||||
|
||||
typedef std::map<std::string,std::string> exc_t;
|
||||
std::map<pos_t, exc_t> exc;
|
||||
};
|
||||
|
||||
} // end of namespace wnb
|
180
contrib/wordnet-blast/wnb/main.cc
Normal file
180
contrib/wordnet-blast/wnb/main.cc
Normal file
@ -0,0 +1,180 @@
|
||||
|
||||
#include <iostream>
|
||||
#include <fstream>
|
||||
#include <sstream>
|
||||
|
||||
#include <boost/progress.hpp>
|
||||
#include <boost/algorithm/string.hpp>
|
||||
|
||||
#include <wnb/core/wordnet.hh>
|
||||
#include <wnb/core/load_wordnet.hh>
|
||||
#include <wnb/core/info_helper.hh>
|
||||
#include <wnb/nltk_similarity.hh>
|
||||
#include <wnb/std_ext.hh>
|
||||
|
||||
using namespace wnb;
|
||||
using namespace boost;
|
||||
using namespace boost::algorithm;
|
||||
|
||||
/// Validate the command line: exactly two arguments are expected and the
/// first one (the wordnet directory) must end with '/'.
/// Prints the usage line and returns true when the command line is invalid;
/// returns false when it is fine.
bool usage(int argc, char ** argv)
{
  std::string dir;
  if (argc >= 2)
    dir = std::string(argv[1]);

  // An empty directory argument is invalid too; it must be rejected before
  // looking at its last character (dir[dir.length()-1] would be out of range).
  const bool dir_ok = !dir.empty() && dir[dir.length() - 1] == '/';

  if (argc != 3 || !dir_ok)
  {
    std::cout << argv[0] << " .../wordnet_dir/ word_list_file" << std::endl;
    return true;
  }
  return false;
}
|
||||
|
||||
/// A word together with its similarity score.
struct ws
{
  std::string w; ///< the word
  float s;       ///< its score

  /// Ordering is reversed on purpose: sorting puts the highest score first.
  bool operator<(const ws& a) const { return a.s < s; }
};
|
||||
|
||||
|
||||
/// Compute similarity of word with words in word list
|
||||
std::vector<ws>
|
||||
compute_similarities(wordnet& wn,
|
||||
const std::string& word,
|
||||
const std::vector<std::string>& word_list)
|
||||
{
|
||||
std::vector<ws> wslist;
|
||||
std::vector<synset> synsets1 = wn.get_synsets(word);
|
||||
|
||||
for (unsigned i = 0; i < synsets1.size(); i++)
|
||||
for (unsigned k = 0; k < synsets1[i].words.size(); k++)
|
||||
std::cout << " - " << synsets1[i].words[k] << std::endl;
|
||||
|
||||
nltk_similarity path_similarity(wn);
|
||||
{
|
||||
progress_timer t;
|
||||
progress_display show_progress(word_list.size());
|
||||
|
||||
for (unsigned k = 0; k < word_list.size(); k++)
|
||||
{
|
||||
const std::string& w = word_list[k];
|
||||
float max = 0;
|
||||
std::vector<synset> synsets2 = wn.get_synsets(w);
|
||||
for (unsigned i = 0; i < synsets1.size(); i++)
|
||||
{
|
||||
for (unsigned j = 0; j < synsets2.size(); j++)
|
||||
{
|
||||
float s = path_similarity(synsets1[i], synsets2[j], 6);
|
||||
if (s > max)
|
||||
max = s;
|
||||
}
|
||||
}
|
||||
ws e = {w, max};
|
||||
wslist.push_back(e);
|
||||
++show_progress;
|
||||
}
|
||||
}
|
||||
|
||||
return wslist;
|
||||
}
|
||||
|
||||
/// Print the (at most) ten words of \p word_list most similar to \p word,
/// best first, one "word score" pair per line.
void similarity_test(wordnet& wn,
                     const std::string& word,
                     std::vector<std::string>& word_list)
{
  std::vector<ws> ranked = compute_similarities(wn, word, word_list);
  std::stable_sort(ranked.begin(), ranked.end());

  const std::size_t shown = std::min(ranked.size(), size_t(10));
  for (std::size_t i = 0; i != shown; ++i)
    std::cout << ranked[i].w << " " << ranked[i].s << std::endl;
}
|
||||
|
||||
/// Print an overview of index entry \p idx for part of speech \p pos: a
/// header with the number of senses, then one numbered line per synset with
/// its tag count for the lemma (when recorded), its words and its gloss.
/// For adjectives, everything from the first "(" of a word onwards is dropped.
void print_synsets(pos_t pos, wnb::index& idx, wordnet& wn)
{
  const std::string& lemma = idx.lemma;
  const std::string pos_name = get_name_from_pos(pos);
  const std::size_t sense_total = idx.synset_ids.size();

  std::cout << "\nOverview of " << pos_name << " " << lemma << "\n\n";
  std::cout << "The " << pos_name << " " << lemma << " has "
            << sense_total << ((sense_total == 1) ? " sense": " senses");

  if (idx.tagsense_cnt != 0)
    std::cout << " (first " << idx.tagsense_cnt << " from tagged texts)";
  else
    std::cout << " (no senses from tagged texts)";

  std::cout << "\n";
  std::cout << " \n";

  for (std::size_t rank = 0; rank < sense_total; ++rank)
  {
    const synset& sense = wn.wordnet_graph[idx.synset_ids[rank]];

    std::cout << rank + 1 << ". ";

    // Tag count(s) recorded for this lemma.
    for (const std::pair<std::string, int>& tag : sense.tag_cnts)
      if (tag.first == lemma)
        std::cout << "(" << tag.second << ") ";

    std::vector<std::string> nwords;
    for (const std::string& w : sense.words)
    {
      if (pos == A)
        nwords.push_back(w.substr(0, w.find_first_of("(")));
      else
        nwords.push_back(w);
    }

    std::cout << replace_all_copy(join(nwords, ", "), "_", " ");
    std::cout << " -- (" << trim_copy(sense.gloss) << ")";
    std::cout << std::endl;
  }
}
|
||||
|
||||
/// Print the synsets of every index entry of \p word whose part of speech is
/// \p pos. Nothing is printed for an empty word or when pos equals -1.
void wn_like(wordnet& wn, const std::string& word, pos_t pos)
{
  if (word.empty())
    return;

  typedef std::vector<wnb::index>::iterator index_it;
  const std::pair<index_it, index_it> range = wn.get_indexes(word);

  for (index_it entry = range.first; entry != range.second; ++entry)
  {
    if (pos == -1 || entry->pos != pos)
      continue;
    print_synsets(pos, *entry, wn);
  }
}
|
||||
|
||||
/// For every word of \p word_list and every part of speech (UNKNOWN
/// excluded), print the overview of the word itself and, when morphing
/// changes it, of its morphed form as well.
void batch_test(wordnet& wn, std::vector<std::string>& word_list)
{
  for (const std::string& word : word_list)
  {
    for (unsigned p = 1; p < POS_ARRAY_SIZE; p++)
    {
      const pos_t pos = static_cast<pos_t>(p);

      wn_like(wn, word, pos);

      const std::string mword = wn.morphword(word, pos);
      if (mword != word)
        wn_like(wn, mword, pos);
    }
  }
}
|
||||
|
||||
int main(int argc, char ** argv)
|
||||
{
|
||||
if (usage(argc, argv))
|
||||
return 1;
|
||||
|
||||
// read command line
|
||||
std::string wordnet_dir = argv[1];
|
||||
std::string test_file = argv[2];
|
||||
|
||||
wordnet wn(wordnet_dir);
|
||||
|
||||
// read test file
|
||||
std::string list = ext::read_file(test_file);
|
||||
std::vector<std::string> wl = ext::split(list);
|
||||
|
||||
batch_test(wn, wl);
|
||||
}
|
||||
|
146
contrib/wordnet-blast/wnb/nltk_similarity.hh
Normal file
146
contrib/wordnet-blast/wnb/nltk_similarity.hh
Normal file
@ -0,0 +1,146 @@
|
||||
#ifndef _NLTK_SIMILARITY_HH
|
||||
# define _NLTK_SIMILARITY_HH
|
||||
|
||||
# include <queue>
|
||||
# include <boost/graph/filtered_graph.hpp>
|
||||
# include <wnb/core/wordnet.hh>
|
||||
|
||||
namespace wnb
|
||||
{
|
||||
namespace internal
{

  /// Edge predicate that keeps only the edges whose pointer symbol is 1,
  /// i.e. hypernym relations (instance hypernyms are not used here).
  template <typename PointerSymbolMap>
  struct hyper_edge
  {
    hyper_edge() { }

    hyper_edge(PointerSymbolMap pointer_symbol)
      : m_pointer_symbol(pointer_symbol) { }

    /// True when edge \p e is a hypernym relation.
    template <typename Edge>
    bool operator()(const Edge& e) const
    {
      const int symbol = get(m_pointer_symbol, e);
      return 1 == symbol;
    }

    PointerSymbolMap m_pointer_symbol; ///< gives the pointer symbol of an edge
  };

} // end of namespace internal
|
||||
|
||||
|
||||
class nltk_similarity
|
||||
{
|
||||
|
||||
typedef boost::property_map<wordnet::graph,
|
||||
int ptr::*>::type PointerSymbolMap;
|
||||
typedef boost::filtered_graph<wordnet::graph,
|
||||
internal::hyper_edge<PointerSymbolMap> > G;
|
||||
typedef boost::graph_traits<G>::vertex_descriptor vertex;
|
||||
|
||||
internal::hyper_edge<PointerSymbolMap> filter;
|
||||
G fg;
|
||||
|
||||
public:
|
||||
|
||||
nltk_similarity(wordnet& wn)
|
||||
: filter(get(&ptr::pointer_symbol, wn.wordnet_graph)),
|
||||
fg(wn.wordnet_graph, filter)
|
||||
{ }
|
||||
|
||||
/// Get list of hypernyms of s along with distance to s
|
||||
std::map<vertex, int> hypernym_map(vertex s);
|
||||
|
||||
/// Get shortest path between and synset1 and synset2.
|
||||
int shortest_path_distance(const synset& synset1, const synset& synset2);
|
||||
|
||||
/// return disance
|
||||
float operator()(const synset& synset1, const synset& synset2, int=0);
|
||||
|
||||
};
|
||||
|
||||
std::map<nltk_similarity::vertex, int>
|
||||
nltk_similarity::hypernym_map(nltk_similarity::vertex s)
|
||||
{
|
||||
std::map<vertex, int> map;
|
||||
|
||||
// Python:
|
||||
// for (hypernym in self[HYPERNYM])
|
||||
// distances |= hypernym.hypernym_distances(distance+1);
|
||||
|
||||
boost::graph_traits<G>::out_edge_iterator e, e_end;
|
||||
std::queue<vertex> q;
|
||||
|
||||
q.push(s);
|
||||
map[s] = 0;
|
||||
while (!q.empty())
|
||||
{
|
||||
vertex u = q.front(); q.pop();
|
||||
|
||||
int new_d = map[u] + 1;
|
||||
for (boost::tuples::tie(e, e_end) = out_edges(u, fg); e != e_end; ++e)
|
||||
{
|
||||
vertex v = target(*e,fg);
|
||||
q.push(v);
|
||||
|
||||
if (map.find(v) != map.end())
|
||||
{
|
||||
if (new_d < map[v])
|
||||
map[v] = new_d;
|
||||
else
|
||||
q.pop();
|
||||
}
|
||||
else
|
||||
map[v] = new_d;
|
||||
}
|
||||
}
|
||||
|
||||
return map;
|
||||
}
|
||||
|
||||
|
||||
int
|
||||
nltk_similarity::shortest_path_distance(const synset& synset1, const synset& synset2)
|
||||
{
|
||||
vertex v1 = synset1.id;
|
||||
vertex v2 = synset2.id;
|
||||
|
||||
std::map<vertex, int> map1 = hypernym_map(v1);
|
||||
std::map<vertex, int> map2 = hypernym_map(v2);
|
||||
|
||||
// For each ancestor synset common to both subject synsets, find the
|
||||
// connecting path length. Return the shortest of these.
|
||||
|
||||
int path_distance = -1;
|
||||
std::map<vertex, int>::iterator it, it2;
|
||||
for (it = map1.begin(); it != map1.end(); it++)
|
||||
for (it2 = map2.begin(); it2 != map2.end(); it2++)
|
||||
if (fg[it->first] == fg[it2->first])
|
||||
{
|
||||
int new_distance = it->second + it2->second;
|
||||
if (path_distance < 0 || new_distance < path_distance)
|
||||
path_distance = new_distance;
|
||||
}
|
||||
|
||||
return path_distance;
|
||||
}
|
||||
|
||||
|
||||
float
|
||||
nltk_similarity::operator()(const synset& synset1, const synset& synset2, int)
|
||||
{
|
||||
int distance = shortest_path_distance(synset1, synset2);
|
||||
if (distance >= 0)
|
||||
return 1. / (distance + 1);
|
||||
else
|
||||
return -1;
|
||||
}
|
||||
|
||||
|
||||
} // end of namespace wnb
|
||||
|
||||
#endif /* _NLTK_SIMILARITY_HH */
|
||||
|
90
contrib/wordnet-blast/wnb/std_ext.hh
Normal file
90
contrib/wordnet-blast/wnb/std_ext.hh
Normal file
@ -0,0 +1,90 @@
|
||||
#ifndef _STD_EXT_HH
|
||||
# define _STD_EXT_HH
|
||||
|
||||
# include <algorithm>
# include <fstream>
# include <iterator>
# include <sstream>
# include <stdexcept>
# include <string>
# include <vector>
|
||||
|
||||
namespace ext
|
||||
{
|
||||
/// Load the entire content of file \a fn into a C++ string.
/// The file is opened in binary mode; std::runtime_error is thrown
/// when it cannot be opened.
inline
std::string read_file(const std::string& fn)
{
  std::ifstream input(fn.c_str(), std::ios::binary);
  if (!input.is_open())
    throw std::runtime_error("File not found: " + fn);

  // Drain the stream buffer from its current position up to end-of-file.
  std::istreambuf_iterator<char> first(input);
  std::istreambuf_iterator<char> last;
  return std::string(first, last);
}
|
||||
|
||||
/// Break \a str into its whitespace-delimited words.
/// Runs of whitespace never produce empty words; an empty or
/// all-whitespace input gives an empty vector.
inline
std::vector<std::string> split(const std::string& str)
{
  std::vector<std::string> words;
  std::istringstream input(str);

  // Formatted extraction skips leading whitespace and reads one word at a
  // time, stopping once the stream is exhausted.
  for (std::string word; input >> word; )
    words.push_back(word);

  return words;
}
|
||||
|
||||
/// Cut \a s at every occurrence of the character \a seperator.
/// Empty fields are kept, so the result always holds at least one element
/// (the whole string when the separator never occurs).
inline
std::vector<std::string> split(const std::string& s, char seperator)
{
  std::vector<std::string> fields;
  std::string::size_type field_start = 0;

  for (std::string::size_type hit = s.find(seperator, field_start);
       hit != std::string::npos;
       hit = s.find(seperator, field_start))
  {
    fields.push_back(s.substr(field_start, hit - field_start));
    field_start = hit + 1; // resume right after the separator
  }

  // Whatever follows the last separator (possibly nothing) is the final field.
  fields.push_back(s.substr(field_start));
  return fields;
}
|
||||
|
||||
/// Tell whether \a str terminates with \a ending.
/// An \a ending longer than \a str never matches; an empty \a ending always does.
inline
bool
ends_with(const std::string& str, const std::string& ending)
{
  const std::string::size_type tail_len = ending.length();
  if (tail_len > str.length())
    return false;

  // Compare the last tail_len characters of str against the whole of ending.
  return str.compare(str.length() - tail_len, tail_len, ending) == 0;
}
|
||||
|
||||
|
||||
/// Sorted unique: return the distinct elements of \a v in ascending order.
/// Note that \a v itself is sorted in place and then passed through
/// std::unique, so only its leading distinct range stays meaningful afterwards.
template <typename T>
inline
T s_unique(T& v)
{
  std::sort(v.begin(), v.end());
  typename T::iterator distinct_end = std::unique(v.begin(), v.end());

  // Build the result directly from the deduplicated leading range.
  return T(v.begin(), distinct_end);
}
|
||||
|
||||
} // end of ext
|
||||
|
||||
#endif /* _STD_EXT_HH */
|
||||
|
@ -124,3 +124,4 @@ endif()
|
||||
set_source_files_properties("pointInPolygon.cpp" PROPERTIES COMPILE_FLAGS -fno-sanitize=signed-integer-overflow)
|
||||
|
||||
target_link_libraries(clickhouse_functions PRIVATE stemmer)
|
||||
target_link_libraries(clickhouse_functions PRIVATE wnb)
|
@ -1,10 +1,11 @@
|
||||
#include <Common/Exception.h>
|
||||
#include <Interpreters/SynonymsExtensions.h>
|
||||
#include <Functions/SynonymsExtensions.h>
|
||||
|
||||
#include <fstream>
|
||||
#include <list>
|
||||
|
||||
#include <boost/algorithm/string.hpp>
|
||||
#include <wnb/core/wordnet.hh>
|
||||
|
||||
namespace DB
|
||||
{
|
||||
@ -48,7 +49,7 @@ public:
|
||||
}
|
||||
}
|
||||
|
||||
Synset * getSynonyms(const std::string_view & token) const override
|
||||
const Synset * getSynonyms(const std::string_view & token) const override
|
||||
{
|
||||
auto it = table.find(token);
|
||||
|
||||
@ -62,20 +63,23 @@ public:
|
||||
class WordnetSynonymsExtension : public ISynonymsExtension
|
||||
{
|
||||
private:
|
||||
// std::vector<std::vector<String>> data;
|
||||
wnb::wordnet wn;
|
||||
|
||||
public:
|
||||
WordnetSynonymsExtension(const String & /*path*/)
|
||||
{
|
||||
WordnetSynonymsExtension(const String & path) : wn(path) {}
|
||||
|
||||
}
|
||||
|
||||
Synset * getSynonyms(const std::string_view & /*token*/) const override
|
||||
const Synset * getSynonyms(const std::string_view & token) const override
|
||||
{
|
||||
return nullptr;
|
||||
return wn.get_synset(std::string(token));
|
||||
}
|
||||
};
|
||||
|
||||
/// Same check as the helper in StringUtils.h, duplicated here to keep the
/// number of dependencies low. Returns true when `s` begins with the
/// NUL-terminated string `prefix` (an empty prefix always matches).
static bool startsWith(const std::string & s, const char * prefix)
{
    const size_t prefix_size = strlen(prefix);
    if (s.size() < prefix_size)
        return false;

    return memcmp(s.data(), prefix, prefix_size) == 0;
}
|
||||
|
||||
SynonymsExtensions::SynonymsExtensions(const Poco::Util::AbstractConfiguration & config)
|
||||
{
|
||||
String prefix = "synonyms_extensions";
|
||||
@ -89,7 +93,7 @@ SynonymsExtensions::SynonymsExtensions(const Poco::Util::AbstractConfiguration &
|
||||
|
||||
for (const auto & key : keys)
|
||||
{
|
||||
if (key == "extension")
|
||||
if (startsWith(key, "extension"))
|
||||
{
|
||||
const auto & ext_name = config.getString(prefix + "." + key + ".name", "");
|
||||
const auto & ext_path = config.getString(prefix + "." + key + ".path", "");
|
@ -19,7 +19,7 @@ public:
|
||||
|
||||
//ISynonymsExtension(const String & path);
|
||||
|
||||
virtual Synset * getSynonyms(const std::string_view & token) const = 0;
|
||||
virtual const Synset * getSynonyms(const std::string_view & token) const = 0;
|
||||
|
||||
virtual ~ISynonymsExtension() = default;
|
||||
};
|
@ -6,7 +6,7 @@
|
||||
#include <Functions/FunctionFactory.h>
|
||||
#include <Functions/FunctionHelpers.h>
|
||||
#include <Functions/IFunction.h>
|
||||
#include <Interpreters/SynonymsExtensions.h>
|
||||
#include <Functions/SynonymsExtensions.h>
|
||||
#include <Interpreters/Context.h>
|
||||
|
||||
#include <string_view>
|
||||
|
@ -53,7 +53,6 @@
|
||||
#include <Interpreters/InterserverCredentials.h>
|
||||
#include <Interpreters/Cluster.h>
|
||||
#include <Interpreters/InterserverIOHandler.h>
|
||||
#include <Interpreters/SynonymsExtensions.h>
|
||||
#include <Interpreters/SystemLog.h>
|
||||
#include <Interpreters/Context.h>
|
||||
#include <Interpreters/DDLWorker.h>
|
||||
@ -75,7 +74,7 @@
|
||||
#include <Interpreters/DatabaseCatalog.h>
|
||||
#include <Storages/MergeTree/BackgroundJobsExecutor.h>
|
||||
#include <Storages/MergeTree/MergeTreeDataPartUUID.h>
|
||||
|
||||
#include <Functions/SynonymsExtensions.h>
|
||||
|
||||
namespace ProfileEvents
|
||||
{
|
||||
|
Loading…
Reference in New Issue
Block a user