mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-11-21 23:21:59 +00:00
added WordNet synonyms extensions
This commit is contained in:
parent
876f51ab95
commit
ed12fb5604
1
contrib/CMakeLists.txt
vendored
1
contrib/CMakeLists.txt
vendored
@ -331,3 +331,4 @@ endif()
|
|||||||
|
|
||||||
add_subdirectory(fast_float)
|
add_subdirectory(fast_float)
|
||||||
add_subdirectory(libstemmer-c-cmake)
|
add_subdirectory(libstemmer-c-cmake)
|
||||||
|
add_subdirectory(wordnet-blast-cmake)
|
||||||
|
@ -13,11 +13,12 @@ if (NOT USE_INTERNAL_BOOST_LIBRARY)
|
|||||||
regex
|
regex
|
||||||
context
|
context
|
||||||
coroutine
|
coroutine
|
||||||
|
graph
|
||||||
)
|
)
|
||||||
|
|
||||||
if(Boost_INCLUDE_DIR AND Boost_FILESYSTEM_LIBRARY AND Boost_FILESYSTEM_LIBRARY AND
|
# Boost_IOSTREAMS_LIBRARY is linked into _boost_iostreams further down, so it has to
# be part of this availability check (the filesystem library used to be tested twice).
if(Boost_INCLUDE_DIR AND Boost_FILESYSTEM_LIBRARY AND Boost_IOSTREAMS_LIBRARY AND
|
||||||
Boost_PROGRAM_OPTIONS_LIBRARY AND Boost_REGEX_LIBRARY AND Boost_SYSTEM_LIBRARY AND Boost_CONTEXT_LIBRARY AND
|
Boost_PROGRAM_OPTIONS_LIBRARY AND Boost_REGEX_LIBRARY AND Boost_SYSTEM_LIBRARY AND Boost_CONTEXT_LIBRARY AND
|
||||||
Boost_COROUTINE_LIBRARY)
|
Boost_COROUTINE_LIBRARY AND Boost_GRAPH_LIBRARY)
|
||||||
|
|
||||||
set(EXTERNAL_BOOST_FOUND 1)
|
set(EXTERNAL_BOOST_FOUND 1)
|
||||||
|
|
||||||
@ -32,6 +33,7 @@ if (NOT USE_INTERNAL_BOOST_LIBRARY)
|
|||||||
add_library (_boost_system INTERFACE)
|
add_library (_boost_system INTERFACE)
|
||||||
add_library (_boost_context INTERFACE)
|
add_library (_boost_context INTERFACE)
|
||||||
add_library (_boost_coroutine INTERFACE)
|
add_library (_boost_coroutine INTERFACE)
|
||||||
|
add_library (_boost_graph INTERFACE)
|
||||||
|
|
||||||
target_link_libraries (_boost_filesystem INTERFACE ${Boost_FILESYSTEM_LIBRARY})
|
target_link_libraries (_boost_filesystem INTERFACE ${Boost_FILESYSTEM_LIBRARY})
|
||||||
target_link_libraries (_boost_iostreams INTERFACE ${Boost_IOSTREAMS_LIBRARY})
|
target_link_libraries (_boost_iostreams INTERFACE ${Boost_IOSTREAMS_LIBRARY})
|
||||||
@ -40,6 +42,7 @@ if (NOT USE_INTERNAL_BOOST_LIBRARY)
|
|||||||
target_link_libraries (_boost_system INTERFACE ${Boost_SYSTEM_LIBRARY})
|
target_link_libraries (_boost_system INTERFACE ${Boost_SYSTEM_LIBRARY})
|
||||||
target_link_libraries (_boost_context INTERFACE ${Boost_CONTEXT_LIBRARY})
|
target_link_libraries (_boost_context INTERFACE ${Boost_CONTEXT_LIBRARY})
|
||||||
target_link_libraries (_boost_coroutine INTERFACE ${Boost_COROUTINE_LIBRARY})
|
target_link_libraries (_boost_coroutine INTERFACE ${Boost_COROUTINE_LIBRARY})
|
||||||
|
target_link_libraries (_boost_graph INTERFACE ${Boost_GRAPH_LIBRARY})
|
||||||
|
|
||||||
add_library (boost::filesystem ALIAS _boost_filesystem)
|
add_library (boost::filesystem ALIAS _boost_filesystem)
|
||||||
add_library (boost::iostreams ALIAS _boost_iostreams)
|
add_library (boost::iostreams ALIAS _boost_iostreams)
|
||||||
@ -48,6 +51,7 @@ if (NOT USE_INTERNAL_BOOST_LIBRARY)
|
|||||||
add_library (boost::system ALIAS _boost_system)
|
add_library (boost::system ALIAS _boost_system)
|
||||||
add_library (boost::context ALIAS _boost_context)
|
add_library (boost::context ALIAS _boost_context)
|
||||||
add_library (boost::coroutine ALIAS _boost_coroutine)
|
add_library (boost::coroutine ALIAS _boost_coroutine)
|
||||||
|
add_library (boost::graph ALIAS _boost_graph)
|
||||||
else()
|
else()
|
||||||
set(EXTERNAL_BOOST_FOUND 0)
|
set(EXTERNAL_BOOST_FOUND 0)
|
||||||
message (${RECONFIGURE_MESSAGE_LEVEL} "Can't find system boost")
|
message (${RECONFIGURE_MESSAGE_LEVEL} "Can't find system boost")
|
||||||
@ -221,4 +225,16 @@ if (NOT EXTERNAL_BOOST_FOUND)
|
|||||||
add_library (boost::coroutine ALIAS _boost_coroutine)
|
add_library (boost::coroutine ALIAS _boost_coroutine)
|
||||||
target_include_directories (_boost_coroutine PRIVATE ${LIBRARY_DIR})
|
target_include_directories (_boost_coroutine PRIVATE ${LIBRARY_DIR})
|
||||||
target_link_libraries(_boost_coroutine PRIVATE _boost_context)
|
target_link_libraries(_boost_coroutine PRIVATE _boost_context)
|
||||||
|
|
||||||
|
# graph
|
||||||
|
|
||||||
|
set (SRCS_GRAPH
|
||||||
|
"${LIBRARY_DIR}/libs/graph/src/graphml.cpp"
|
||||||
|
"${LIBRARY_DIR}/libs/graph/src/read_graphviz_new.cpp"
|
||||||
|
)
|
||||||
|
|
||||||
|
add_library (_boost_graph ${SRCS_GRAPH})
|
||||||
|
add_library (boost::graph ALIAS _boost_graph)
|
||||||
|
target_include_directories (_boost_graph PRIVATE ${LIBRARY_DIR})
|
||||||
|
|
||||||
endif ()
|
endif ()
|
||||||
|
13
contrib/wordnet-blast-cmake/CMakeLists.txt
Normal file
13
contrib/wordnet-blast-cmake/CMakeLists.txt
Normal file
@ -0,0 +1,13 @@
|
|||||||
|
# Builds the vendored wordnet-blast sources (contrib/wordnet-blast) as the `wnb` library.
set(LIBRARY_DIR "${ClickHouse_SOURCE_DIR}/contrib/wordnet-blast")

set(SRCS
    "${LIBRARY_DIR}/wnb/core/info_helper.cc"
    "${LIBRARY_DIR}/wnb/core/load_wordnet.cc"
    "${LIBRARY_DIR}/wnb/core/wordnet.cc"
)

add_library(wnb ${SRCS})

# Headers below ${LIBRARY_DIR} (for example wnb/bfs.hh) include <boost/graph/...>,
# and that directory is exported PUBLIC, so consumers of wnb need the Boost headers
# as well. The compiled boost::graph library is only required to link wnb itself.
target_link_libraries(wnb
    PUBLIC boost::headers_only
    PRIVATE boost::graph
)

target_include_directories(wnb PUBLIC "${LIBRARY_DIR}")
|
1
contrib/wordnet-blast/AUTHORS
Normal file
1
contrib/wordnet-blast/AUTHORS
Normal file
@ -0,0 +1 @@
|
|||||||
|
Ugo Jardonnet ugo.jardonnet/gmail
|
65
contrib/wordnet-blast/CMakeLists.txt
Normal file
65
contrib/wordnet-blast/CMakeLists.txt
Normal file
@ -0,0 +1,65 @@
|
|||||||
|
CMAKE_MINIMUM_REQUIRED(VERSION 2.6)
|
||||||
|
|
||||||
|
PROJECT(wnb)
|
||||||
|
|
||||||
|
# Boost dependency
|
||||||
|
#--------------------------------------------------
|
||||||
|
|
||||||
|
# IF (${CMAKE_SYSTEM_NAME} MATCHES "Darwin")
|
||||||
|
# SET (BOOST_ROOT /Developer/boost_build/) # Suggested path
|
||||||
|
# ELSE()
|
||||||
|
# SET (BOOST_ROOT "/usr/include")
|
||||||
|
# ENDIF()
|
||||||
|
##############
|
||||||
|
SET (BOOST_ROOT "${ClickHouse_SOURCE_DIR}/contrib/boost")
|
||||||
|
##############
|
||||||
|
MESSAGE(STATUS "** Search Boost root: ${BOOST_ROOT}")
|
||||||
|
FIND_PACKAGE(Boost 1.70.0 COMPONENTS graph REQUIRED)
|
||||||
|
MESSAGE(STATUS "** Boost Include: ${Boost_INCLUDE_DIR}")
|
||||||
|
MESSAGE(STATUS "** Boost Libraries: ${Boost_LIBRARY_DIRS}")
|
||||||
|
MESSAGE(STATUS "** Boost Libraries: ${Boost_LIBRARIES}")
|
||||||
|
|
||||||
|
INCLUDE_DIRECTORIES(${Boost_INCLUDE_DIR})
|
||||||
|
LINK_DIRECTORIES(${Boost_LIBRARY_DIRS})
|
||||||
|
|
||||||
|
# Project
|
||||||
|
#--------------------------------------------------
|
||||||
|
|
||||||
|
LINK_DIRECTORIES(${wnb_SOURCE_DIR}/lib)
|
||||||
|
INCLUDE_DIRECTORIES(${PROJECT_BINARY_DIR})
|
||||||
|
|
||||||
|
SET(PROJECT_VERSION "0.6")
|
||||||
|
SET(ARCHIVE_NAME ${CMAKE_PROJECT_NAME}-${PROJECT_VERSION})
|
||||||
|
|
||||||
|
ADD_CUSTOM_TARGET(dist
|
||||||
|
COMMAND git archive --prefix=${ARCHIVE_NAME}/ HEAD
|
||||||
|
| bzip2 > ${CMAKE_BINARY_DIR}/${ARCHIVE_NAME}.tar.bz2
|
||||||
|
WORKING_DIRECTORY ${CMAKE_SOURCE_DIR})
|
||||||
|
|
||||||
|
ADD_CUSTOM_TARGET(check
|
||||||
|
COMMAND ./check/check.sh ./check/list.txt)
|
||||||
|
|
||||||
|
|
||||||
|
## Compiler flags
|
||||||
|
IF (CMAKE_COMPILER_IS_GNUCXX)
  # CMAKE_CXX_FLAGS is a single space-separated string, not a CMake list.
  # list(APPEND ...) would join the new text with ';' whenever the variable is
  # already non-empty, so extend the string explicitly instead.
  SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} --std=c++11 -O3 -DNDEBUG -Wall -Wextra")
  #SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g -Wall -Wextra")
ENDIF()
|
||||||
|
|
||||||
|
SET(WNB_SRCS wnb/core/wordnet.cc
|
||||||
|
wnb/core/load_wordnet.cc wnb/core/info_helper.cc)
|
||||||
|
|
||||||
|
# Executable
|
||||||
|
#--------------------------------------------------
|
||||||
|
ADD_EXECUTABLE (wntest wnb/main.cc ${WNB_SRCS})
|
||||||
|
SET(EXECUTABLE_OUTPUT_PATH ${wnb_BINARY_DIR}/bin)
|
||||||
|
|
||||||
|
# Static library
|
||||||
|
#--------------------------------------------------
|
||||||
|
ADD_LIBRARY(wnb ${WNB_SRCS})
|
||||||
|
SET(LIBRARY_OUTPUT_PATH ${wnb_BINARY_DIR}/lib)
|
||||||
|
|
||||||
|
IF (Boost_FOUND)
|
||||||
|
TARGET_LINK_LIBRARIES(wntest ${Boost_LIBRARIES})
|
||||||
|
TARGET_LINK_LIBRARIES(wnb ${Boost_LIBRARIES})
|
||||||
|
ENDIF()
|
43
contrib/wordnet-blast/README
Normal file
43
contrib/wordnet-blast/README
Normal file
@ -0,0 +1,43 @@
|
|||||||
|
|
||||||
|
=====================================================================
|
||||||
|
WordNet Blast
|
||||||
|
=====================================================================
|
||||||
|
|
||||||
|
In-memory access to the WordNet ontology.
|
||||||
|
|
||||||
|
DEPENDENCIES:
|
||||||
|
boost 1.46
|
||||||
|
wordnet-sense-index
|
||||||
|
colordiff (for wntest)
|
||||||
|
|
||||||
|
INSTALL:
|
||||||
|
cmake CMakeLists.txt
|
||||||
|
make
|
||||||
|
|
||||||
|
TESTS: (Beta)
|
||||||
|
make check
|
||||||
|
|
||||||
|
USAGE:
|
||||||
|
#include "wordnet.hh"
|
||||||
|
#include "wnb/nltk_similarity.hh"
|
||||||
|
|
||||||
|
using namespace std;
|
||||||
|
using namespace wnb;
|
||||||
|
|
||||||
|
int main()
|
||||||
|
{
|
||||||
|
wordnet wn(PATH_TO_WORDNET);
|
||||||
|
|
||||||
|
vector<synset> synsets1 = wn.get_synsets("cat");
|
||||||
|
vector<synset> synsets2 = wn.get_synsets("dog");
|
||||||
|
|
||||||
|
nltk_similarity similarity(wn);
|
||||||
|
float d = similarity(synsets1[0], synsets2[0], 6);
|
||||||
|
}
|
||||||
|
|
||||||
|
BUGS:
|
||||||
|
- Word Morphing is sometimes incorrect.
|
||||||
|
|
||||||
|
REFERENCE:
|
||||||
|
George A. Miller (1995). WordNet: A Lexical Database for English.
|
||||||
|
Communications of the ACM Vol. 38, No. 11: 39-41.
|
25
contrib/wordnet-blast/WORDNET_LICENSE
Normal file
25
contrib/wordnet-blast/WORDNET_LICENSE
Normal file
@ -0,0 +1,25 @@
|
|||||||
|
This license is available as the file LICENSE in any downloaded version of
|
||||||
|
WordNet.
|
||||||
|
|
||||||
|
WordNet Release 3.0
|
||||||
|
|
||||||
|
This software and database is being provided to you, the LICENSEE, by Princeton
|
||||||
|
University under the following license. By obtaining, using and/or copying this
|
||||||
|
software and database, you agree that you have read, understood, and will comply
|
||||||
|
with these terms and conditions.: Permission to use, copy, modify and distribute
|
||||||
|
this software and database and its documentation for any purpose and without fee
|
||||||
|
or royalty is hereby granted, provided that you agree to comply with the
|
||||||
|
following copyright notice and statements, including the disclaimer, and that
|
||||||
|
the same appear on ALL copies of the software, database and documentation,
|
||||||
|
including modifications that you make for internal use or for distribution.
|
||||||
|
WordNet 3.0 Copyright 2006 by Princeton University. All rights reserved. THIS
|
||||||
|
SOFTWARE AND DATABASE IS PROVIDED "AS IS" AND PRINCETON UNIVERSITY MAKES NO
|
||||||
|
REPRESENTATIONS OR WARRANTIES, EXPRESS OR IMPLIED. BY WAY OF EXAMPLE, BUT NOT
|
||||||
|
LIMITATION, PRINCETON UNIVERSITY MAKES NO REPRESENTATIONS OR WARRANTIES OF
|
||||||
|
MERCHANT- ABILITY OR FITNESS FOR ANY PARTICULAR PURPOSE OR THAT THE USE OF THE
|
||||||
|
LICENSED SOFTWARE, DATABASE OR DOCUMENTATION WILL NOT INFRINGE ANY THIRD PARTY
|
||||||
|
PATENTS, COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS. The name of Princeton
|
||||||
|
University or Princeton may not be used in advertising or publicity pertaining
|
||||||
|
to distribution of the software and/or database. Title to copyright in this
|
||||||
|
software, database and any associated documentation shall at all times remain
|
||||||
|
with Princeton University and LICENSEE agrees to preserve same.
|
11
contrib/wordnet-blast/changelog
Normal file
11
contrib/wordnet-blast/changelog
Normal file
@ -0,0 +1,11 @@
|
|||||||
|
* 0.6
|
||||||
|
- Improve tests
|
||||||
|
- get_synsets by pos
|
||||||
|
- Load wordnet a bit faster
|
||||||
|
- Fix build on Mac OS (thanks to Roman Kutlak)
|
||||||
|
- Update doc
|
||||||
|
- Improve testing
|
||||||
|
* 0.5
|
||||||
|
- get_synsets
|
||||||
|
with morphing partially implemented (thanks to Yaron Feigin)
|
||||||
|
- sense similarity
|
852
contrib/wordnet-blast/check/biglist.txt
Normal file
852
contrib/wordnet-blast/check/biglist.txt
Normal file
@ -0,0 +1,852 @@
|
|||||||
|
a
|
||||||
|
able
|
||||||
|
about
|
||||||
|
account
|
||||||
|
acid
|
||||||
|
across
|
||||||
|
act
|
||||||
|
addition
|
||||||
|
adjustment
|
||||||
|
advertisement
|
||||||
|
after
|
||||||
|
again
|
||||||
|
against
|
||||||
|
agreement
|
||||||
|
air
|
||||||
|
all
|
||||||
|
almost
|
||||||
|
among
|
||||||
|
amount
|
||||||
|
amusement
|
||||||
|
and
|
||||||
|
angle
|
||||||
|
angry
|
||||||
|
animal
|
||||||
|
answer
|
||||||
|
ant
|
||||||
|
any
|
||||||
|
apparatus
|
||||||
|
apple
|
||||||
|
approval
|
||||||
|
arch
|
||||||
|
argument
|
||||||
|
arm
|
||||||
|
army
|
||||||
|
art
|
||||||
|
as
|
||||||
|
at
|
||||||
|
attack
|
||||||
|
attempt
|
||||||
|
attention
|
||||||
|
attraction
|
||||||
|
authority
|
||||||
|
automatic
|
||||||
|
awake
|
||||||
|
baby
|
||||||
|
back
|
||||||
|
bad
|
||||||
|
bag
|
||||||
|
balance
|
||||||
|
ball
|
||||||
|
band
|
||||||
|
base
|
||||||
|
basin
|
||||||
|
basket
|
||||||
|
bath
|
||||||
|
be
|
||||||
|
beautiful
|
||||||
|
because
|
||||||
|
bed
|
||||||
|
bee
|
||||||
|
before
|
||||||
|
behaviour
|
||||||
|
belief
|
||||||
|
bell
|
||||||
|
bent
|
||||||
|
berry
|
||||||
|
between
|
||||||
|
bird
|
||||||
|
birth
|
||||||
|
bit
|
||||||
|
bite
|
||||||
|
bitter
|
||||||
|
black
|
||||||
|
blade
|
||||||
|
blood
|
||||||
|
blow
|
||||||
|
blue
|
||||||
|
board
|
||||||
|
boat
|
||||||
|
body
|
||||||
|
boiling
|
||||||
|
bone
|
||||||
|
book
|
||||||
|
boot
|
||||||
|
bottle
|
||||||
|
box
|
||||||
|
boy
|
||||||
|
brain
|
||||||
|
brake
|
||||||
|
branch
|
||||||
|
brass
|
||||||
|
bread
|
||||||
|
breath
|
||||||
|
brick
|
||||||
|
bridge
|
||||||
|
bright
|
||||||
|
broken
|
||||||
|
brother
|
||||||
|
brown
|
||||||
|
brush
|
||||||
|
bucket
|
||||||
|
building
|
||||||
|
bulb
|
||||||
|
burn
|
||||||
|
burst
|
||||||
|
business
|
||||||
|
but
|
||||||
|
butter
|
||||||
|
button
|
||||||
|
by
|
||||||
|
cake
|
||||||
|
camera
|
||||||
|
canvas
|
||||||
|
card
|
||||||
|
care
|
||||||
|
carriage
|
||||||
|
cart
|
||||||
|
cat
|
||||||
|
cause
|
||||||
|
certain
|
||||||
|
chain
|
||||||
|
chalk
|
||||||
|
chance
|
||||||
|
change
|
||||||
|
cheap
|
||||||
|
cheese
|
||||||
|
chemical
|
||||||
|
chest
|
||||||
|
chief
|
||||||
|
chin
|
||||||
|
church
|
||||||
|
circle
|
||||||
|
clean
|
||||||
|
clear
|
||||||
|
clock
|
||||||
|
cloth
|
||||||
|
cloud
|
||||||
|
coal
|
||||||
|
coat
|
||||||
|
cold
|
||||||
|
collar
|
||||||
|
colour
|
||||||
|
comb
|
||||||
|
come
|
||||||
|
comfort
|
||||||
|
committee
|
||||||
|
common
|
||||||
|
company
|
||||||
|
comparison
|
||||||
|
competition
|
||||||
|
complete
|
||||||
|
complex
|
||||||
|
condition
|
||||||
|
connection
|
||||||
|
conscious
|
||||||
|
control
|
||||||
|
cook
|
||||||
|
copper
|
||||||
|
copy
|
||||||
|
cord
|
||||||
|
cork
|
||||||
|
cotton
|
||||||
|
cough
|
||||||
|
country
|
||||||
|
cover
|
||||||
|
cow
|
||||||
|
crack
|
||||||
|
credit
|
||||||
|
crime
|
||||||
|
cruel
|
||||||
|
crush
|
||||||
|
cry
|
||||||
|
cup
|
||||||
|
cup
|
||||||
|
current
|
||||||
|
curtain
|
||||||
|
curve
|
||||||
|
cushion
|
||||||
|
damage
|
||||||
|
danger
|
||||||
|
dark
|
||||||
|
daughter
|
||||||
|
day
|
||||||
|
dead
|
||||||
|
dear
|
||||||
|
death
|
||||||
|
debt
|
||||||
|
decision
|
||||||
|
deep
|
||||||
|
degree
|
||||||
|
delicate
|
||||||
|
dependent
|
||||||
|
design
|
||||||
|
desire
|
||||||
|
destruction
|
||||||
|
detail
|
||||||
|
development
|
||||||
|
different
|
||||||
|
digestion
|
||||||
|
direction
|
||||||
|
dirty
|
||||||
|
discovery
|
||||||
|
discussion
|
||||||
|
disease
|
||||||
|
disgust
|
||||||
|
distance
|
||||||
|
distribution
|
||||||
|
division
|
||||||
|
do
|
||||||
|
dog
|
||||||
|
door
|
||||||
|
doubt
|
||||||
|
down
|
||||||
|
drain
|
||||||
|
drawer
|
||||||
|
dress
|
||||||
|
drink
|
||||||
|
driving
|
||||||
|
drop
|
||||||
|
dry
|
||||||
|
dust
|
||||||
|
ear
|
||||||
|
early
|
||||||
|
earth
|
||||||
|
east
|
||||||
|
edge
|
||||||
|
education
|
||||||
|
effect
|
||||||
|
egg
|
||||||
|
elastic
|
||||||
|
electric
|
||||||
|
end
|
||||||
|
engine
|
||||||
|
enough
|
||||||
|
equal
|
||||||
|
error
|
||||||
|
even
|
||||||
|
event
|
||||||
|
ever
|
||||||
|
every
|
||||||
|
example
|
||||||
|
exchange
|
||||||
|
existence
|
||||||
|
expansion
|
||||||
|
experience
|
||||||
|
expert
|
||||||
|
eye
|
||||||
|
face
|
||||||
|
fact
|
||||||
|
fall
|
||||||
|
false
|
||||||
|
family
|
||||||
|
far
|
||||||
|
farm
|
||||||
|
fat
|
||||||
|
father
|
||||||
|
fear
|
||||||
|
feather
|
||||||
|
feeble
|
||||||
|
feeling
|
||||||
|
female
|
||||||
|
fertile
|
||||||
|
fiction
|
||||||
|
field
|
||||||
|
fight
|
||||||
|
finger
|
||||||
|
fire
|
||||||
|
first
|
||||||
|
fish
|
||||||
|
fixed
|
||||||
|
flag
|
||||||
|
flame
|
||||||
|
flat
|
||||||
|
flight
|
||||||
|
floor
|
||||||
|
flower
|
||||||
|
fly
|
||||||
|
fold
|
||||||
|
food
|
||||||
|
foolish
|
||||||
|
foot
|
||||||
|
for
|
||||||
|
force
|
||||||
|
fork
|
||||||
|
form
|
||||||
|
forward
|
||||||
|
fowl
|
||||||
|
frame
|
||||||
|
free
|
||||||
|
frequent
|
||||||
|
friend
|
||||||
|
from
|
||||||
|
front
|
||||||
|
fruit
|
||||||
|
full
|
||||||
|
future
|
||||||
|
garden
|
||||||
|
general
|
||||||
|
get
|
||||||
|
girl
|
||||||
|
give
|
||||||
|
glass
|
||||||
|
glove
|
||||||
|
go
|
||||||
|
goat
|
||||||
|
gold
|
||||||
|
good
|
||||||
|
government
|
||||||
|
grain
|
||||||
|
grass
|
||||||
|
great
|
||||||
|
green
|
||||||
|
grey
|
||||||
|
grip
|
||||||
|
group
|
||||||
|
growth
|
||||||
|
guide
|
||||||
|
gun
|
||||||
|
hair
|
||||||
|
hammer
|
||||||
|
hand
|
||||||
|
hanging
|
||||||
|
happy
|
||||||
|
harbour
|
||||||
|
hard
|
||||||
|
harmony
|
||||||
|
hat
|
||||||
|
hate
|
||||||
|
have
|
||||||
|
he
|
||||||
|
head
|
||||||
|
healthy
|
||||||
|
hear
|
||||||
|
hearing
|
||||||
|
heart
|
||||||
|
heat
|
||||||
|
help
|
||||||
|
high
|
||||||
|
history
|
||||||
|
hole
|
||||||
|
hollow
|
||||||
|
hook
|
||||||
|
hope
|
||||||
|
horn
|
||||||
|
horse
|
||||||
|
hospital
|
||||||
|
hour
|
||||||
|
house
|
||||||
|
how
|
||||||
|
humour
|
||||||
|
I
|
||||||
|
ice
|
||||||
|
idea
|
||||||
|
if
|
||||||
|
ill
|
||||||
|
important
|
||||||
|
impulse
|
||||||
|
in
|
||||||
|
increase
|
||||||
|
industry
|
||||||
|
ink
|
||||||
|
insect
|
||||||
|
instrument
|
||||||
|
insurance
|
||||||
|
interest
|
||||||
|
invention
|
||||||
|
iron
|
||||||
|
island
|
||||||
|
jelly
|
||||||
|
jewel
|
||||||
|
join
|
||||||
|
journey
|
||||||
|
judge
|
||||||
|
jump
|
||||||
|
keep
|
||||||
|
kettle
|
||||||
|
key
|
||||||
|
kick
|
||||||
|
kind
|
||||||
|
kiss
|
||||||
|
knee
|
||||||
|
knife
|
||||||
|
knot
|
||||||
|
knowledge
|
||||||
|
land
|
||||||
|
language
|
||||||
|
last
|
||||||
|
late
|
||||||
|
laugh
|
||||||
|
law
|
||||||
|
lead
|
||||||
|
leaf
|
||||||
|
learning
|
||||||
|
leather
|
||||||
|
left
|
||||||
|
leg
|
||||||
|
let
|
||||||
|
letter
|
||||||
|
level
|
||||||
|
library
|
||||||
|
lift
|
||||||
|
light
|
||||||
|
like
|
||||||
|
limit
|
||||||
|
line
|
||||||
|
linen
|
||||||
|
lip
|
||||||
|
liquid
|
||||||
|
list
|
||||||
|
little
|
||||||
|
living
|
||||||
|
lock
|
||||||
|
long
|
||||||
|
look
|
||||||
|
loose
|
||||||
|
loss
|
||||||
|
loud
|
||||||
|
love
|
||||||
|
low
|
||||||
|
machine
|
||||||
|
make
|
||||||
|
male
|
||||||
|
man
|
||||||
|
manager
|
||||||
|
map
|
||||||
|
mark
|
||||||
|
market
|
||||||
|
married
|
||||||
|
mass
|
||||||
|
match
|
||||||
|
material
|
||||||
|
may
|
||||||
|
meal
|
||||||
|
measure
|
||||||
|
meat
|
||||||
|
medical
|
||||||
|
meeting
|
||||||
|
memory
|
||||||
|
metal
|
||||||
|
middle
|
||||||
|
military
|
||||||
|
milk
|
||||||
|
mind
|
||||||
|
mine
|
||||||
|
minute
|
||||||
|
mist
|
||||||
|
mixed
|
||||||
|
money
|
||||||
|
monkey
|
||||||
|
month
|
||||||
|
moon
|
||||||
|
morning
|
||||||
|
mother
|
||||||
|
motion
|
||||||
|
mountain
|
||||||
|
mouth
|
||||||
|
move
|
||||||
|
much
|
||||||
|
muscle
|
||||||
|
music
|
||||||
|
nail
|
||||||
|
name
|
||||||
|
narrow
|
||||||
|
nation
|
||||||
|
natural
|
||||||
|
near
|
||||||
|
necessary
|
||||||
|
neck
|
||||||
|
need
|
||||||
|
needle
|
||||||
|
nerve
|
||||||
|
net
|
||||||
|
new
|
||||||
|
news
|
||||||
|
night
|
||||||
|
no
|
||||||
|
noise
|
||||||
|
normal
|
||||||
|
north
|
||||||
|
nose
|
||||||
|
not
|
||||||
|
note
|
||||||
|
now
|
||||||
|
number
|
||||||
|
nut
|
||||||
|
observation
|
||||||
|
of
|
||||||
|
off
|
||||||
|
offer
|
||||||
|
office
|
||||||
|
oil
|
||||||
|
old
|
||||||
|
on
|
||||||
|
only
|
||||||
|
open
|
||||||
|
operation
|
||||||
|
opinion
|
||||||
|
opposite
|
||||||
|
or
|
||||||
|
orange
|
||||||
|
order
|
||||||
|
organization
|
||||||
|
ornament
|
||||||
|
other
|
||||||
|
out
|
||||||
|
oven
|
||||||
|
over
|
||||||
|
owner
|
||||||
|
page
|
||||||
|
pain
|
||||||
|
paint
|
||||||
|
paper
|
||||||
|
parallel
|
||||||
|
parcel
|
||||||
|
part
|
||||||
|
past
|
||||||
|
paste
|
||||||
|
payment
|
||||||
|
peace
|
||||||
|
pen
|
||||||
|
pencil
|
||||||
|
person
|
||||||
|
physical
|
||||||
|
picture
|
||||||
|
pig
|
||||||
|
pin
|
||||||
|
pipe
|
||||||
|
place
|
||||||
|
plane
|
||||||
|
plant
|
||||||
|
plate
|
||||||
|
play
|
||||||
|
please
|
||||||
|
pleasure
|
||||||
|
plough
|
||||||
|
pocket
|
||||||
|
point
|
||||||
|
poison
|
||||||
|
polish
|
||||||
|
political
|
||||||
|
poor
|
||||||
|
porter
|
||||||
|
position
|
||||||
|
possible
|
||||||
|
pot
|
||||||
|
potato
|
||||||
|
powder
|
||||||
|
power
|
||||||
|
present
|
||||||
|
price
|
||||||
|
print
|
||||||
|
prison
|
||||||
|
private
|
||||||
|
probable
|
||||||
|
process
|
||||||
|
produce
|
||||||
|
profit
|
||||||
|
property
|
||||||
|
prose
|
||||||
|
protest
|
||||||
|
public
|
||||||
|
pull
|
||||||
|
pump
|
||||||
|
punishment
|
||||||
|
purpose
|
||||||
|
push
|
||||||
|
put
|
||||||
|
quality
|
||||||
|
question
|
||||||
|
quick
|
||||||
|
quiet
|
||||||
|
quite
|
||||||
|
rail
|
||||||
|
rain
|
||||||
|
range
|
||||||
|
rat
|
||||||
|
rate
|
||||||
|
ray
|
||||||
|
reaction
|
||||||
|
reading
|
||||||
|
ready
|
||||||
|
reason
|
||||||
|
receipt
|
||||||
|
record
|
||||||
|
red
|
||||||
|
regret
|
||||||
|
regular
|
||||||
|
relation
|
||||||
|
religion
|
||||||
|
representative
|
||||||
|
request
|
||||||
|
respect
|
||||||
|
responsible
|
||||||
|
rest
|
||||||
|
reward
|
||||||
|
rhythm
|
||||||
|
rice
|
||||||
|
right
|
||||||
|
ring
|
||||||
|
river
|
||||||
|
road
|
||||||
|
rod
|
||||||
|
roll
|
||||||
|
roof
|
||||||
|
room
|
||||||
|
root
|
||||||
|
rough
|
||||||
|
round
|
||||||
|
rub
|
||||||
|
rule
|
||||||
|
run
|
||||||
|
sad
|
||||||
|
safe
|
||||||
|
sail
|
||||||
|
salt
|
||||||
|
same
|
||||||
|
sand
|
||||||
|
say
|
||||||
|
scale
|
||||||
|
school
|
||||||
|
science
|
||||||
|
scissors
|
||||||
|
screw
|
||||||
|
sea
|
||||||
|
seat
|
||||||
|
second
|
||||||
|
secret
|
||||||
|
secretary
|
||||||
|
see
|
||||||
|
seed
|
||||||
|
seem
|
||||||
|
selection
|
||||||
|
self
|
||||||
|
send
|
||||||
|
sense
|
||||||
|
separate
|
||||||
|
serious
|
||||||
|
servant
|
||||||
|
sex
|
||||||
|
shade
|
||||||
|
shake
|
||||||
|
shame
|
||||||
|
sharp
|
||||||
|
sheep
|
||||||
|
shelf
|
||||||
|
ship
|
||||||
|
shirt
|
||||||
|
shock
|
||||||
|
shoe
|
||||||
|
short
|
||||||
|
shut
|
||||||
|
side
|
||||||
|
sign
|
||||||
|
silk
|
||||||
|
silver
|
||||||
|
simple
|
||||||
|
sister
|
||||||
|
size
|
||||||
|
skin
|
||||||
|
|
||||||
|
skirt
|
||||||
|
sky
|
||||||
|
sleep
|
||||||
|
slip
|
||||||
|
slope
|
||||||
|
slow
|
||||||
|
small
|
||||||
|
smash
|
||||||
|
smell
|
||||||
|
smile
|
||||||
|
smoke
|
||||||
|
smooth
|
||||||
|
snake
|
||||||
|
sneeze
|
||||||
|
snow
|
||||||
|
so
|
||||||
|
soap
|
||||||
|
society
|
||||||
|
sock
|
||||||
|
soft
|
||||||
|
solid
|
||||||
|
some
|
||||||
|
|
||||||
|
son
|
||||||
|
song
|
||||||
|
sort
|
||||||
|
sound
|
||||||
|
soup
|
||||||
|
south
|
||||||
|
space
|
||||||
|
spade
|
||||||
|
special
|
||||||
|
sponge
|
||||||
|
spoon
|
||||||
|
spring
|
||||||
|
square
|
||||||
|
stage
|
||||||
|
stamp
|
||||||
|
star
|
||||||
|
start
|
||||||
|
statement
|
||||||
|
station
|
||||||
|
steam
|
||||||
|
steel
|
||||||
|
stem
|
||||||
|
step
|
||||||
|
stick
|
||||||
|
sticky
|
||||||
|
stiff
|
||||||
|
still
|
||||||
|
stitch
|
||||||
|
stocking
|
||||||
|
stomach
|
||||||
|
stone
|
||||||
|
stop
|
||||||
|
store
|
||||||
|
story
|
||||||
|
straight
|
||||||
|
strange
|
||||||
|
street
|
||||||
|
stretch
|
||||||
|
strong
|
||||||
|
structure
|
||||||
|
substance
|
||||||
|
such
|
||||||
|
sudden
|
||||||
|
sugar
|
||||||
|
suggestion
|
||||||
|
summer
|
||||||
|
sun
|
||||||
|
support
|
||||||
|
surprise
|
||||||
|
sweet
|
||||||
|
swim
|
||||||
|
system
|
||||||
|
table
|
||||||
|
tail
|
||||||
|
take
|
||||||
|
talk
|
||||||
|
tall
|
||||||
|
taste
|
||||||
|
tax
|
||||||
|
teaching
|
||||||
|
tendency
|
||||||
|
test
|
||||||
|
than
|
||||||
|
that
|
||||||
|
the
|
||||||
|
then
|
||||||
|
theory
|
||||||
|
there
|
||||||
|
thick
|
||||||
|
thin
|
||||||
|
thing
|
||||||
|
this
|
||||||
|
thought
|
||||||
|
thread
|
||||||
|
throat
|
||||||
|
through
|
||||||
|
through
|
||||||
|
thumb
|
||||||
|
thunder
|
||||||
|
ticket
|
||||||
|
tight
|
||||||
|
till
|
||||||
|
time
|
||||||
|
tin
|
||||||
|
tired
|
||||||
|
to
|
||||||
|
toe
|
||||||
|
together
|
||||||
|
tomorrow
|
||||||
|
tongue
|
||||||
|
tooth
|
||||||
|
top
|
||||||
|
touch
|
||||||
|
town
|
||||||
|
trade
|
||||||
|
train
|
||||||
|
transport
|
||||||
|
tray
|
||||||
|
tree
|
||||||
|
trick
|
||||||
|
trouble
|
||||||
|
trousers
|
||||||
|
true
|
||||||
|
turn
|
||||||
|
twist
|
||||||
|
umbrella
|
||||||
|
under
|
||||||
|
unit
|
||||||
|
up
|
||||||
|
use
|
||||||
|
value
|
||||||
|
verse
|
||||||
|
very
|
||||||
|
vessel
|
||||||
|
view
|
||||||
|
violent
|
||||||
|
voice
|
||||||
|
waiting
|
||||||
|
walk
|
||||||
|
wall
|
||||||
|
war
|
||||||
|
warm
|
||||||
|
wash
|
||||||
|
waste
|
||||||
|
watch
|
||||||
|
water
|
||||||
|
wave
|
||||||
|
wax
|
||||||
|
way
|
||||||
|
weather
|
||||||
|
week
|
||||||
|
weight
|
||||||
|
well
|
||||||
|
west
|
||||||
|
wet
|
||||||
|
wheel
|
||||||
|
when
|
||||||
|
where
|
||||||
|
while
|
||||||
|
whip
|
||||||
|
whistle
|
||||||
|
white
|
||||||
|
who
|
||||||
|
why
|
||||||
|
wide
|
||||||
|
will
|
||||||
|
wind
|
||||||
|
window
|
||||||
|
wine
|
||||||
|
wing
|
||||||
|
winter
|
||||||
|
wire
|
||||||
|
wise
|
||||||
|
with
|
||||||
|
woman
|
||||||
|
wood
|
||||||
|
wool
|
||||||
|
word
|
||||||
|
work
|
||||||
|
worm
|
||||||
|
wound
|
||||||
|
writing
|
||||||
|
wrong
|
||||||
|
year
|
||||||
|
yellow
|
||||||
|
yes
|
||||||
|
yesterday
|
||||||
|
you
|
||||||
|
young
|
16
contrib/wordnet-blast/check/check.sh
Normal file
16
contrib/wordnet-blast/check/check.sh
Normal file
@ -0,0 +1,16 @@
|
|||||||
|
#!/bin/bash

# Compares the output of ./bin/wntest with the output of the `wn` command-line
# tool for every word of a word list and shows the differences side by side.
#
# Usage: check.sh WORD_LIST_FILE

WNHOME=/usr/share/wordnet/

if [ "$#" -ne 1 ]; then
    echo "usage: $0 WORD_LIST_FILE" >&2
    exit 1
fi

# check WORD_LIST
#   Writes WORD_LIST.blast (wntest output) and WORD_LIST.wn (`wn` output),
#   timing both runs, then diffs the two files with colordiff.
check() {
    local word_list="$1"

    echo "./bin/wntest $WNHOME ${word_list}"
    time ./bin/wntest "$WNHOME" "${word_list}" > "${word_list}.blast"

    # \$i keeps the loop variable literal so the echoed command matches what runs.
    echo "for i in \`cat ${word_list}\`; do wn \$i -over; done"
    # Word splitting of the file content is intentional: one `wn` call per word,
    # blank lines in the list are skipped.
    time for i in $(cat "${word_list}"); do wn "$i" -over; done > "${word_list}.wn"

    echo "diff ${word_list}.wn ${word_list}.blast -b"
    colordiff -y "${word_list}.wn" "${word_list}.blast" -b
}

check "$1"
|
7
contrib/wordnet-blast/check/list.txt
Normal file
7
contrib/wordnet-blast/check/list.txt
Normal file
@ -0,0 +1,7 @@
|
|||||||
|
cat
|
||||||
|
lions
|
||||||
|
city
|
||||||
|
building
|
||||||
|
salvation
|
||||||
|
medications
|
||||||
|
haven
|
72
contrib/wordnet-blast/wnb/bfs.hh
Normal file
72
contrib/wordnet-blast/wnb/bfs.hh
Normal file
@ -0,0 +1,72 @@
|
|||||||
|
#ifndef _BFS_HH
|
||||||
|
# define _BFS_HH
|
||||||
|
|
||||||
|
# include <boost/graph/breadth_first_search.hpp>
|
||||||
|
# include <boost/graph/filtered_graph.hpp>
|
||||||
|
|
||||||
|
namespace wnb
|
||||||
|
{
|
||||||
|
struct synset;
|
||||||
|
|
||||||
|
namespace bfs // breadth first search tools
|
||||||
|
{
|
||||||
|
/// bfs_visitor
|
||||||
|
/// Sum distances and throw answer if target synset found
|
||||||
|
template <typename DistanceMap>
|
||||||
|
class distance_recorder : public boost::default_bfs_visitor
|
||||||
|
{
|
||||||
|
public:
|
||||||
|
distance_recorder(DistanceMap dist, const synset& s, int max)
|
||||||
|
: d(dist), target(s), max_length(max)
|
||||||
|
{ }
|
||||||
|
|
||||||
|
template <typename Edge, typename Graph>
|
||||||
|
void tree_edge(Edge e, const Graph& g) const
|
||||||
|
{
|
||||||
|
typename boost::graph_traits<Graph>::vertex_descriptor
|
||||||
|
u = boost::source(e, g), v = boost::target(e, g);
|
||||||
|
d[v] = d[u] + 1;
|
||||||
|
|
||||||
|
if (g[v] == target)
|
||||||
|
throw d[v];
|
||||||
|
if (d[v] > max_length)
|
||||||
|
throw -1;
|
||||||
|
}
|
||||||
|
private:
|
||||||
|
DistanceMap d;
|
||||||
|
const synset& target;
|
||||||
|
int max_length;
|
||||||
|
};
|
||||||
|
|
||||||
|
/// Convenience function
|
||||||
|
template <typename DistanceMap>
|
||||||
|
distance_recorder<DistanceMap>
|
||||||
|
record_distance(DistanceMap d, const synset& s, int m)
|
||||||
|
{
|
||||||
|
return distance_recorder<DistanceMap>(d, s, m);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// This predicate function object determines which edges of the original
|
||||||
|
/// graph will show up in the filtered graph.
|
||||||
|
//FIXME: Do we really need a map here (check cost of property_map construction
|
||||||
|
// / should be light)
|
||||||
|
template <typename PointerSymbolMap>
|
||||||
|
struct hypo_hyper_edge {
|
||||||
|
hypo_hyper_edge() { }
|
||||||
|
hypo_hyper_edge(PointerSymbolMap pointer_symbol)
|
||||||
|
: m_pointer_symbol(pointer_symbol) { }
|
||||||
|
template <typename Edge>
|
||||||
|
bool operator()(const Edge& e) const {
|
||||||
|
int p_s = get(m_pointer_symbol, e);
|
||||||
|
//see pointer symbol list in info_helper.hh
|
||||||
|
return p_s == 1 || p_s == 2 || p_s == 3 || p_s == 4;
|
||||||
|
}
|
||||||
|
PointerSymbolMap m_pointer_symbol;
|
||||||
|
};
|
||||||
|
|
||||||
|
} // end of wnb::bfs
|
||||||
|
|
||||||
|
} // end of namespace wnb
|
||||||
|
|
||||||
|
#endif /* _BFS_HH */
|
||||||
|
|
148
contrib/wordnet-blast/wnb/core/info_helper.cc
Normal file
148
contrib/wordnet-blast/wnb/core/info_helper.cc
Normal file
@ -0,0 +1,148 @@
|
|||||||
|
#include "info_helper.hh"
|
||||||
|
|
||||||
|
#include <iostream>
|
||||||
|
#include <fstream>
|
||||||
|
#include <sstream>
|
||||||
|
#include <map>
|
||||||
|
|
||||||
|
#include <cassert>
|
||||||
|
|
||||||
|
namespace wnb
|
||||||
|
{
|
||||||
|
|
||||||
|
// Class info_helper
|
||||||
|
|
||||||
|
/// List of pointer symbols
|
||||||
|
const char *
|
||||||
|
info_helper::symbols[info_helper::NB_SYMBOLS] = {
|
||||||
|
"!" , // 0 Antonym
|
||||||
|
"@" , // 1 Hypernym
|
||||||
|
"@i", // 2 Instance Hypernym
|
||||||
|
"~" , // 3 Hyponym
|
||||||
|
"~i", // 4 Instance Hyponym
|
||||||
|
"#m", // 5 Member holonym
|
||||||
|
"#s", // 6 Substance holonym
|
||||||
|
"#p", // 7 Part holonym
|
||||||
|
"%m", // 8 Member meronym
|
||||||
|
"%s", // 9 Substance meronym
|
||||||
|
"%p", // 10 Part meronym
|
||||||
|
"=" , // 11 Attribute
|
||||||
|
"+" , // 12 Derivationally related form
|
||||||
|
";c", // 13 Domain of synset - TOPIC
|
||||||
|
"-c", // 14 Member of this domain - TOPIC
|
||||||
|
";r", // 15 Domain of synset - REGION
|
||||||
|
"-r", // 16 Member of this domain - REGION
|
||||||
|
";u", // 17 Domain of synset - USAGE
|
||||||
|
"-u", // 18 Member of this domain - USAGE
|
||||||
|
|
||||||
|
//The pointer_symbol s for verbs are:
|
||||||
|
"*", // 19 Entailment
|
||||||
|
">", // 20 Cause
|
||||||
|
"^", // 21 Also see
|
||||||
|
"$", // 22 Verb Group
|
||||||
|
|
||||||
|
//The pointer_symbol s for adjectives are:
|
||||||
|
"&", // 23 Similar to
|
||||||
|
"<", // 24 Participle of verb
|
||||||
|
"\\", // 25 Pertainym (pertains to noun)
|
||||||
|
"=", // 26 Attribute
|
||||||
|
};
|
||||||
|
|
||||||
|
// Morphology rule tables.
//
// sufx[k] and addr[k] form one detachment rule: when a word ends with
// sufx[k], that suffix is removed and addr[k] is appended (see
// wordnet::wordbase). The rules of one part of speech are stored
// contiguously; offsets[pos] is the index of its first rule and cnts[pos]
// the number of rules (see wordnet::morphword).

const std::string info_helper::sufx[] = {
  // nouns      (rules 0..7)
  "s", "ses", "xes", "zes", "ches", "shes", "men", "ies",
  // verbs      (rules 8..15)
  "s", "ies", "es", "es", "ed", "ed", "ing", "ing",
  // adjectives (rules 16..19)
  "er", "est", "er", "est"
};

const std::string info_helper::addr[] = {
  // nouns      (rules 0..7)
  "", "s", "x", "z", "ch", "sh", "man", "y",
  // verbs      (rules 8..15)
  "", "y", "e", "", "e", "", "e", "",
  // adjectives (rules 16..19)
  "", "", "e", "e"
};

// Indexed by pos_t: UNKNOWN, N, V, A, R, S.
const int info_helper::offsets[info_helper::NUMPARTS] = { 0, 0, 8, 16, 0, 0 };
const int info_helper::cnts[info_helper::NUMPARTS] = { 0, 8, 8, 4, 0, 0 };
|
||||||
|
|
||||||
|
void
|
||||||
|
info_helper::update_pos_maps()
|
||||||
|
{
|
||||||
|
// http://wordnet.princeton.edu/wordnet/man/wndb.5WN.html#sect3
|
||||||
|
|
||||||
|
indice_offset[UNKNOWN] = 0;
|
||||||
|
|
||||||
|
indice_offset[N] = 0;
|
||||||
|
indice_offset[V] = indice_offset[N] + pos_maps[N].size();
|
||||||
|
indice_offset[A] = indice_offset[V] + pos_maps[V].size();
|
||||||
|
indice_offset[R] = indice_offset[A] + pos_maps[A].size();
|
||||||
|
indice_offset[S] = indice_offset[R] + pos_maps[R].size();
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
int info_helper::compute_indice(int offset, pos_t pos)
|
||||||
|
{
|
||||||
|
if (pos == S)
|
||||||
|
pos = A;
|
||||||
|
std::map<int,int>& map = pos_maps[pos];
|
||||||
|
|
||||||
|
assert(pos <= 5 && pos > 0);
|
||||||
|
|
||||||
|
return indice_offset[pos] + map[offset];
|
||||||
|
}
|
||||||
|
|
||||||
|
// Function definitions
|
||||||
|
|
||||||
|
// Build the offset -> local index table of one data file.
//
// The first 29 lines of the file are skipped as header. Every following row
// whose first field parses as an integer offset is numbered consecutively
// from 0. Rows without a leading integer (blank or malformed) are ignored and
// do not consume an index, so the numbering stays dense.
//
// Throws std::runtime_error when the file cannot be opened.
static
std::map<int,int>
preprocess_data(const std::string& fn)
{
  std::ifstream file(fn.c_str());
  if (!file.is_open())
    throw std::runtime_error("preprocess_data: File not found: " + fn);

  std::string row;

  //skip header
  const unsigned int header_nb_lines = 29;
  for (unsigned int i = 0; i < header_nb_lines; i++)
    std::getline(file, row);

  std::map<int,int> map;
  int ind = 0;

  //parse data line
  while (std::getline(file, row))
  {
    std::stringstream srow(row);
    int offset;
    if (!(srow >> offset))
      continue; // no offset on this row: nothing to register

    map.insert(std::pair<int,int>(offset, ind));
    ind++;
  }

  return map; // the stream is closed by its destructor
}
|
||||||
|
|
||||||
|
/// Build an info_helper from the four data files found under \p dn.
/// File names are appended to \p dn as-is (no separator is added).
info_helper
preprocess_wordnet(const std::string& dn)
{
  static const struct { pos_t pos; const char * file; } sources[] = {
    { N, "data.noun" }, // noun_map
    { V, "data.verb" }, // verb_map
    { A, "data.adj"  }, // adj_map
    { R, "data.adv"  }, // adv_map
  };
  const unsigned nb_sources = sizeof(sources) / sizeof(sources[0]);

  info_helper helper;
  for (unsigned k = 0; k < nb_sources; ++k)
    helper.pos_maps[sources[k].pos] = preprocess_data(dn + sources[k].file);

  // Recompute the per-pos starting indices now that the tables are filled.
  helper.update_pos_maps();

  return helper;
}
|
||||||
|
|
||||||
|
} // end of namespace wnb
|
||||||
|
|
85
contrib/wordnet-blast/wnb/core/info_helper.hh
Normal file
85
contrib/wordnet-blast/wnb/core/info_helper.hh
Normal file
@ -0,0 +1,85 @@
|
|||||||
|
#pragma once
|
||||||
|
|
||||||
|
# include <string>
|
||||||
|
# include <stdexcept>
|
||||||
|
# include <map>
|
||||||
|
|
||||||
|
# include "pos_t.hh"
|
||||||
|
|
||||||
|
namespace wnb
|
||||||
|
{
|
||||||
|
|
||||||
|
/// Useful information for wordnet in-memory import
|
||||||
|
struct info_helper
|
||||||
|
{
|
||||||
|
/// Symbols' size
|
||||||
|
static const std::size_t NB_SYMBOLS = 27;
|
||||||
|
static const std::size_t NUMPARTS = POS_ARRAY_SIZE;
|
||||||
|
|
||||||
|
/// List of pointer symbols
|
||||||
|
static const char * symbols[NB_SYMBOLS];
|
||||||
|
static const std::string sufx[];
|
||||||
|
static const std::string addr[];
|
||||||
|
|
||||||
|
static const int offsets[NUMPARTS];
|
||||||
|
static const int cnts[NUMPARTS];
|
||||||
|
|
||||||
|
typedef std::map<int,int> i2of_t; ///< indice/offset correspondences
|
||||||
|
typedef std::map<pos_t, i2of_t> pos_i2of_t; ///< pos / map correspondences
|
||||||
|
|
||||||
|
/// Constructor
|
||||||
|
info_helper() { update_pos_maps(); }
|
||||||
|
|
||||||
|
/// Compute the number of synsets (i.e. the number of vertex in the graph)
|
||||||
|
unsigned nb_synsets()
|
||||||
|
{
|
||||||
|
typedef pos_i2of_t::iterator iter_t;
|
||||||
|
|
||||||
|
int sum = 0;
|
||||||
|
for (iter_t it = pos_maps.begin(); it != pos_maps.end(); it++)
|
||||||
|
sum += (*it).second.size();
|
||||||
|
|
||||||
|
return sum;
|
||||||
|
//return adj_map.size() + adv_map.size() + noun_map.size() + verb_map.size();
|
||||||
|
}
|
||||||
|
|
||||||
|
// Given a pos return the starting indice in the graph
|
||||||
|
int get_indice_offset(pos_t pos)
|
||||||
|
{
|
||||||
|
return indice_offset[pos];
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Helper function computing global indice in graph from local offset
|
||||||
|
int compute_indice(int offset, pos_t pos);
|
||||||
|
|
||||||
|
/// Update a map allowing one to get the correct map given a pos
|
||||||
|
void update_pos_maps();
|
||||||
|
|
||||||
|
int get_symbol(const std::string& ps)
|
||||||
|
{
|
||||||
|
for (unsigned i = 0; i < NB_SYMBOLS; i++)
|
||||||
|
if (ps == symbols[i])
|
||||||
|
return i;
|
||||||
|
throw std::runtime_error("Symbol NOT FOUND.");
|
||||||
|
}
|
||||||
|
|
||||||
|
pos_t get_pos(const char& c)
|
||||||
|
{
|
||||||
|
return get_pos_from_char(c);
|
||||||
|
}
|
||||||
|
|
||||||
|
public:
|
||||||
|
|
||||||
|
// i2of_t adj_map;
|
||||||
|
// i2of_t adv_map;
|
||||||
|
// i2of_t noun_map;
|
||||||
|
// i2of_t verb_map;
|
||||||
|
|
||||||
|
pos_i2of_t pos_maps;
|
||||||
|
std::size_t indice_offset[POS_ARRAY_SIZE];
|
||||||
|
};
|
||||||
|
|
||||||
|
/// Create a new info_help based on wordnet data located in dn (../dict/)
|
||||||
|
info_helper preprocess_wordnet(const std::string& dn);
|
||||||
|
|
||||||
|
} // end of namespace wnb
|
381
contrib/wordnet-blast/wnb/core/load_wordnet.cc
Normal file
381
contrib/wordnet-blast/wnb/core/load_wordnet.cc
Normal file
@ -0,0 +1,381 @@
|
|||||||
|
#include "load_wordnet.hh"
|
||||||
|
|
||||||
|
#include <fstream>
|
||||||
|
#include <iostream>
|
||||||
|
#include <sstream>
|
||||||
|
#include <stdexcept>
|
||||||
|
#include <algorithm>
|
||||||
|
#include <utility>
|
||||||
|
|
||||||
|
#include <boost/graph/adjacency_list.hpp>
|
||||||
|
#include <boost/progress.hpp>
|
||||||
|
#include <boost/algorithm/string.hpp>
|
||||||
|
|
||||||
|
#include <wnb/std_ext.hh>
|
||||||
|
|
||||||
|
#include "wordnet.hh"
|
||||||
|
#include "info_helper.hh"
|
||||||
|
#include "pos_t.hh"
|
||||||
|
|
||||||
|
namespace bg = boost::graph;
|
||||||
|
|
||||||
|
namespace wnb
|
||||||
|
{
|
||||||
|
|
||||||
|
namespace
|
||||||
|
{
|
||||||
|
|
||||||
|
// Load synset's words
|
||||||
|
void load_data_row_words(std::stringstream& srow, synset& synset)
|
||||||
|
{
|
||||||
|
srow >> std::hex >> synset.w_cnt >> std::dec;
|
||||||
|
for (std::size_t i = 0; i < synset.w_cnt; i++)
|
||||||
|
{
|
||||||
|
//word lex_id
|
||||||
|
|
||||||
|
std::string word;
|
||||||
|
srow >> word;
|
||||||
|
synset.words.push_back(word);
|
||||||
|
|
||||||
|
int lex_id;
|
||||||
|
srow >> std::hex >> lex_id >> std::dec;
|
||||||
|
synset.lex_ids.push_back(lex_id);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Add rel to graph
|
||||||
|
void add_wordnet_rel(std::string& pointer_symbol_,// type of relation
|
||||||
|
int synset_offset, // dest offset
|
||||||
|
pos_t pos, // p.o.s. of dest
|
||||||
|
int src, // word src
|
||||||
|
int trgt, // word target
|
||||||
|
synset& synset, // source synset
|
||||||
|
wordnet& wn, // our wordnet
|
||||||
|
info_helper& info) // helper
|
||||||
|
{
|
||||||
|
//if (pos == S || synset.pos == S)
|
||||||
|
// return; //FIXME: check where are s synsets.
|
||||||
|
|
||||||
|
int u = synset.id;
|
||||||
|
int v = info.compute_indice(synset_offset, pos);
|
||||||
|
|
||||||
|
ptr p;
|
||||||
|
p.pointer_symbol = info.get_symbol(pointer_symbol_);
|
||||||
|
p.source = src;
|
||||||
|
p.target = trgt;
|
||||||
|
|
||||||
|
boost::add_edge(u,v, p, wn.wordnet_graph);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
// load ptrs
|
||||||
|
void load_data_row_ptrs(std::stringstream& srow, synset& synset,
|
||||||
|
wordnet& wn, info_helper& info)
|
||||||
|
{
|
||||||
|
srow >> synset.p_cnt;
|
||||||
|
for (std::size_t i = 0; i < synset.p_cnt; i++)
|
||||||
|
{
|
||||||
|
//http://wordnet.princeton.edu/wordnet/man/wndb.5WN.html#sect3
|
||||||
|
//pointer_symbol synset_offset pos source/target
|
||||||
|
std::string pointer_symbol_;
|
||||||
|
int synset_offset;
|
||||||
|
pos_t pos;
|
||||||
|
int src;
|
||||||
|
int trgt;
|
||||||
|
|
||||||
|
srow >> pointer_symbol_;
|
||||||
|
srow >> synset_offset;
|
||||||
|
|
||||||
|
char c;
|
||||||
|
srow >> c;
|
||||||
|
pos = info.get_pos(c);
|
||||||
|
|
||||||
|
//print extracted edges
|
||||||
|
//std::cout << "(" << pointer_symbol << ", " << synset_offset;
|
||||||
|
//std::cout << ", " << pos << ")" << std::endl;
|
||||||
|
|
||||||
|
// Extract source/target words info
|
||||||
|
std::string src_trgt;
|
||||||
|
srow >> src_trgt;
|
||||||
|
std::stringstream ssrc(std::string(src_trgt,0,2));
|
||||||
|
std::stringstream strgt(std::string(src_trgt,2,2));
|
||||||
|
ssrc >> std::hex >> src >> std::dec;
|
||||||
|
strgt >> std::hex >> trgt >> std::dec;
|
||||||
|
|
||||||
|
add_wordnet_rel(pointer_symbol_, synset_offset, pos, src, trgt, synset, wn, info);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
// Load a synset and add it to the wordnet class.
|
||||||
|
void load_data_row(const std::string& row, wordnet& wn, info_helper& info)
|
||||||
|
{
|
||||||
|
//http://wordnet.princeton.edu/wordnet/man/wndb.5WN.html#sect3
|
||||||
|
// synset_offset lex_filenum ss_type w_cnt word lex_id [word lex_id...] p_cnt [ptr...] [frames...] | gloss
|
||||||
|
synset synset;
|
||||||
|
|
||||||
|
std::stringstream srow(row);
|
||||||
|
int synset_offset;
|
||||||
|
srow >> synset_offset;
|
||||||
|
srow >> synset.lex_filenum;
|
||||||
|
char ss_type;
|
||||||
|
srow >> ss_type;
|
||||||
|
|
||||||
|
// extra information
|
||||||
|
synset.pos = info.get_pos(ss_type);
|
||||||
|
synset.id = info.compute_indice(synset_offset, synset.pos);
|
||||||
|
|
||||||
|
// words
|
||||||
|
load_data_row_words(srow, synset);
|
||||||
|
|
||||||
|
// ptrs
|
||||||
|
load_data_row_ptrs(srow, synset, wn, info);
|
||||||
|
|
||||||
|
//frames (skipped)
|
||||||
|
std::string tmp;
|
||||||
|
while (srow >> tmp)
|
||||||
|
if (tmp == "|")
|
||||||
|
break;
|
||||||
|
|
||||||
|
// gloss
|
||||||
|
std::getline(srow, synset.gloss);
|
||||||
|
|
||||||
|
// extra
|
||||||
|
synset.sense_number = 0;
|
||||||
|
|
||||||
|
// Add synset to graph
|
||||||
|
wn.wordnet_graph[synset.id] = synset;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
// Parse data.noun files
|
||||||
|
void load_wordnet_data(const std::string& fn, wordnet& wn, info_helper& info)
|
||||||
|
{
|
||||||
|
std::ifstream fin(fn.c_str());
|
||||||
|
if (!fin.is_open())
|
||||||
|
throw std::runtime_error("File missing: " + fn);
|
||||||
|
|
||||||
|
static const int MAX_LENGTH = 20480;
|
||||||
|
char row[MAX_LENGTH];
|
||||||
|
|
||||||
|
//skip header
|
||||||
|
for(unsigned i = 0; i < 29; i++)
|
||||||
|
fin.getline(row, MAX_LENGTH);
|
||||||
|
|
||||||
|
//parse data line
|
||||||
|
while (fin.getline(row, MAX_LENGTH))
|
||||||
|
load_data_row(row, wn, info);
|
||||||
|
|
||||||
|
fin.close();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
//FIXME: It seems possible to replace synset_offsets with indice here.
|
||||||
|
void load_index_row(const std::string& row, wordnet& wn, info_helper& info)
|
||||||
|
{
|
||||||
|
// lemma pos synset_cnt p_cnt [ptr_symbol...] sense_cnt tagsense_cnt synset_offset [synset_offset...]
|
||||||
|
index index;
|
||||||
|
std::stringstream srow(row);
|
||||||
|
|
||||||
|
char pos;
|
||||||
|
srow >> index.lemma;
|
||||||
|
srow >> pos;
|
||||||
|
index.pos = info.get_pos(pos); // extra data
|
||||||
|
srow >> index.synset_cnt;
|
||||||
|
srow >> index.p_cnt;
|
||||||
|
|
||||||
|
std::string tmp_p;
|
||||||
|
for (std::size_t i = 0; i < index.p_cnt; i++)
|
||||||
|
{
|
||||||
|
srow >> tmp_p;
|
||||||
|
index.ptr_symbols.push_back(tmp_p);
|
||||||
|
}
|
||||||
|
srow >> index.sense_cnt;
|
||||||
|
srow >> index.tagsense_cnt;
|
||||||
|
|
||||||
|
int tmp_o;
|
||||||
|
while (srow >> tmp_o)
|
||||||
|
{
|
||||||
|
index.synset_offsets.push_back(tmp_o);
|
||||||
|
index.synset_ids.push_back(info.compute_indice(tmp_o, index.pos)); // extra data
|
||||||
|
}
|
||||||
|
|
||||||
|
//add synset to index list
|
||||||
|
wn.index_list.push_back(index);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
void load_wordnet_index(const std::string& fn, wordnet& wn, info_helper& info)
|
||||||
|
{
|
||||||
|
std::ifstream fin(fn.c_str());
|
||||||
|
if (!fin.is_open())
|
||||||
|
throw std::runtime_error("File Not Found: " + fn);
|
||||||
|
|
||||||
|
static const int MAX_LENGTH = 20480;
|
||||||
|
char row[MAX_LENGTH];
|
||||||
|
|
||||||
|
//skip header
|
||||||
|
const unsigned int header_nb_lines = 29;
|
||||||
|
for(std::size_t i = 0; i < header_nb_lines; i++)
|
||||||
|
fin.getline(row, MAX_LENGTH);
|
||||||
|
|
||||||
|
//parse data line
|
||||||
|
while (fin.getline(row, MAX_LENGTH))
|
||||||
|
load_index_row(row, wn, info);
|
||||||
|
|
||||||
|
fin.close();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
void load_wordnet_exc(const std::string& dn, std::string cat,
|
||||||
|
wordnet& wn, info_helper&)
|
||||||
|
{
|
||||||
|
std::string fn = dn + cat + ".exc";
|
||||||
|
std::ifstream fin(fn.c_str());
|
||||||
|
if (!fin.is_open())
|
||||||
|
throw std::runtime_error("File Not Found: " + fn);
|
||||||
|
|
||||||
|
std::map<std::string,std::string>& exc = wn.exc[get_pos_from_name(cat)];
|
||||||
|
|
||||||
|
std::string row;
|
||||||
|
|
||||||
|
std::string key, value;
|
||||||
|
while (std::getline(fin, row))
|
||||||
|
{
|
||||||
|
std::stringstream srow(row);
|
||||||
|
srow >> key;
|
||||||
|
srow >> value;
|
||||||
|
|
||||||
|
exc[key] = value;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void load_wordnet_cat(const std::string dn, std::string cat,
|
||||||
|
wordnet& wn, info_helper& info)
|
||||||
|
{
|
||||||
|
load_wordnet_data((dn + "data." + cat), wn, info);
|
||||||
|
load_wordnet_index((dn + "index." + cat), wn, info);
|
||||||
|
load_wordnet_exc(dn, cat, wn, info);
|
||||||
|
}
|
||||||
|
|
||||||
|
// FIXME: this file is not in all packaged version of wordnet
|
||||||
|
void load_wordnet_index_sense(const std::string& dn, wordnet& wn, info_helper& info)
|
||||||
|
{
|
||||||
|
std::string fn = dn + "index.sense";
|
||||||
|
std::ifstream fin(fn.c_str());
|
||||||
|
if (!fin.is_open())
|
||||||
|
throw std::runtime_error("File Not Found: " + fn);
|
||||||
|
|
||||||
|
std::string row;
|
||||||
|
std::string sense_key;
|
||||||
|
int synset_offset;
|
||||||
|
while (std::getline(fin, row))
|
||||||
|
{
|
||||||
|
std::stringstream srow(row);
|
||||||
|
srow >> sense_key;
|
||||||
|
|
||||||
|
// Get the pos of the lemma
|
||||||
|
std::vector<std::string> sk = ext::split(sense_key,'%');
|
||||||
|
std::string word = sk.at(0);
|
||||||
|
std::stringstream tmp(ext::split(sk.at(1), ':').at(0));
|
||||||
|
int ss_type;
|
||||||
|
tmp >> ss_type;
|
||||||
|
pos_t pos = (pos_t) ss_type;
|
||||||
|
|
||||||
|
srow >> synset_offset;
|
||||||
|
|
||||||
|
// Update synset info
|
||||||
|
int u = info.compute_indice(synset_offset, pos);
|
||||||
|
int sense_number;
|
||||||
|
srow >> sense_number;
|
||||||
|
wn.wordnet_graph[u].sense_number += sense_number;
|
||||||
|
int tag_cnt;
|
||||||
|
srow >> tag_cnt;
|
||||||
|
if (tag_cnt != 0)
|
||||||
|
wn.wordnet_graph[u].tag_cnts.push_back( make_pair(word,tag_cnt) );
|
||||||
|
|
||||||
|
//if (synset_offset == 2121620)
|
||||||
|
// std::cout << u << " " << word << " " << synset_offset << " "
|
||||||
|
// << wn.wordnet_graph[u].tag_cnt << " "
|
||||||
|
// << wn.wordnet_graph[u].words[0] << std::endl;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// wn -over used info in cntlist even if this is deprecated
|
||||||
|
// It is ok not to FIX and use this function
|
||||||
|
void load_wordnet_cntlist(const std::string& dn, wordnet& wn, info_helper& info)
|
||||||
|
{
|
||||||
|
std::string fn = dn + "cntlist";
|
||||||
|
std::ifstream fin(fn.c_str());
|
||||||
|
if (!fin.is_open())
|
||||||
|
throw std::runtime_error("File Not Found: " + fn);
|
||||||
|
|
||||||
|
std::string sense_key;
|
||||||
|
int sense_number;
|
||||||
|
int tag_cnt;
|
||||||
|
|
||||||
|
std::string row;
|
||||||
|
while (std::getline(fin, row))
|
||||||
|
{
|
||||||
|
std::stringstream srow(row);
|
||||||
|
|
||||||
|
srow >> sense_key;
|
||||||
|
srow >> sense_number;
|
||||||
|
srow >> tag_cnt;
|
||||||
|
|
||||||
|
// Get the pos of the lemma
|
||||||
|
std::string word = ext::split(sense_key,'%').at(0);
|
||||||
|
std::stringstream tmp(ext::split(ext::split(sense_key,'%').at(1), ':').at(0));
|
||||||
|
int ss_type;
|
||||||
|
tmp >> ss_type;
|
||||||
|
pos_t pos = (pos_t) ss_type;
|
||||||
|
|
||||||
|
// Update synset info
|
||||||
|
int synset_offset; // FIXME
|
||||||
|
int u = info.compute_indice(synset_offset, pos);
|
||||||
|
wn.wordnet_graph[u].sense_number += sense_number;
|
||||||
|
if (tag_cnt != 0)
|
||||||
|
wn.wordnet_graph[u].tag_cnts.push_back( make_pair(word,tag_cnt) );
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
} // end of anonymous namespace
|
||||||
|
|
||||||
|
void load_wordnet(const std::string& dn, wordnet& wn, info_helper& info)
|
||||||
|
{
|
||||||
|
// vertex added in this order a n r v
|
||||||
|
|
||||||
|
std::string fn = dn;
|
||||||
|
|
||||||
|
if (wn._verbose)
|
||||||
|
{
|
||||||
|
std::cout << std::endl;
|
||||||
|
std::cout << "### Loading Wordnet 3.0";
|
||||||
|
boost::progress_display show_progress(5);
|
||||||
|
boost::progress_timer t;
|
||||||
|
|
||||||
|
load_wordnet_cat(dn, "adj", wn, info);
|
||||||
|
++show_progress;
|
||||||
|
load_wordnet_cat(dn, "noun", wn, info);
|
||||||
|
++show_progress;
|
||||||
|
load_wordnet_cat(dn, "adv", wn, info);
|
||||||
|
++show_progress;
|
||||||
|
load_wordnet_cat(dn, "verb", wn, info);
|
||||||
|
++show_progress;
|
||||||
|
load_wordnet_index_sense(dn, wn, info);
|
||||||
|
++show_progress;
|
||||||
|
std::cout << std::endl;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
load_wordnet_cat(dn, "adj", wn, info);
|
||||||
|
load_wordnet_cat(dn, "noun", wn, info);
|
||||||
|
load_wordnet_cat(dn, "adv", wn, info);
|
||||||
|
load_wordnet_cat(dn, "verb", wn, info);
|
||||||
|
load_wordnet_index_sense(dn, wn, info);
|
||||||
|
}
|
||||||
|
|
||||||
|
std::stable_sort(wn.index_list.begin(), wn.index_list.end());
|
||||||
|
}
|
||||||
|
|
||||||
|
} // end of namespace wnb
|
12
contrib/wordnet-blast/wnb/core/load_wordnet.hh
Normal file
12
contrib/wordnet-blast/wnb/core/load_wordnet.hh
Normal file
@ -0,0 +1,12 @@
|
|||||||
|
#pragma once
|
||||||
|
|
||||||
|
# include "info_helper.hh"
|
||||||
|
|
||||||
|
namespace wnb
|
||||||
|
{
|
||||||
|
/// forward declaration
|
||||||
|
struct wordnet;
|
||||||
|
|
||||||
|
/// Load the entire wordnet data base located in \p dn (typically .../dict/)
|
||||||
|
void load_wordnet(const std::string& dn, wordnet& wn, info_helper& info);
|
||||||
|
}
|
61
contrib/wordnet-blast/wnb/core/pos_t.hh
Normal file
61
contrib/wordnet-blast/wnb/core/pos_t.hh
Normal file
@ -0,0 +1,61 @@
|
|||||||
|
#pragma once

// This header uses std::size_t and std::string; include what it needs so it
// does not depend on the includer having pulled them in first.
# include <cstddef>
# include <string>

namespace wnb
{

  /// Number of pos_t values
  static const std::size_t POS_ARRAY_SIZE = 6;
  /// One character per pos_t value, in enum order
  static const char POS_ARRAY[POS_ARRAY_SIZE] = {'u', 'n', 'v', 'a', 'r', 's'};

  /// Part of speech
  enum pos_t
    {
      UNKNOWN = 0,
      N       = 1,
      V       = 2,
      A       = 3,
      R       = 4,
      S       = 5
    };

  /// Convert a category name ("adj", "noun", "adv", "verb", "adj sat") into
  /// a pos_t. Any other name yields UNKNOWN.
  inline pos_t get_pos_from_name(const std::string& pos)
  {
    if (pos == "adj")
      return A;
    if (pos == "noun")
      return N;
    if (pos == "adv")
      return R;
    if (pos == "verb")
      return V;
    if (pos == "adj sat")
      return S;
    return UNKNOWN;
  }

  /// Convert a pos_t into its category name; values without a name yield
  /// "UNKNOWN".
  inline std::string get_name_from_pos(const pos_t& pos)
  {
    switch (pos)
    {
    case A: return "adj";
    case N: return "noun";
    case R: return "adv";
    case V: return "verb";
    case S: return "adj sat";
    default: return "UNKNOWN";
    }
  }

  /// Convert a part-of-speech character ('a', 'n', 'r', 'v', 's') into a
  /// pos_t. Any other character yields UNKNOWN.
  inline pos_t get_pos_from_char(const char& c)
  {
    switch (c)
    {
    case 'a': return A;
    case 'n': return N;
    case 'r': return R;
    case 'v': return V;
    case 's': return S;
    default: return UNKNOWN;
    }
  }

} // end of namespace wnb
|
186
contrib/wordnet-blast/wnb/core/wordnet.cc
Normal file
186
contrib/wordnet-blast/wnb/core/wordnet.cc
Normal file
@ -0,0 +1,186 @@
|
|||||||
|
#include <wnb/core/wordnet.hh>
|
||||||
|
#include <wnb/std_ext.hh>
|
||||||
|
|
||||||
|
#include <string>
|
||||||
|
#include <set>
|
||||||
|
#include <algorithm>
|
||||||
|
#include <stdexcept>
|
||||||
|
#include <boost/graph/breadth_first_search.hpp>
|
||||||
|
#include <boost/graph/filtered_graph.hpp>
|
||||||
|
|
||||||
|
namespace wnb
|
||||||
|
{
|
||||||
|
|
||||||
|
//FIXME: Make (smart) use of fs::path
|
||||||
|
wordnet::wordnet(const std::string& wordnet_dir, bool verbose)
|
||||||
|
: _verbose(verbose)
|
||||||
|
{
|
||||||
|
if (_verbose)
|
||||||
|
{
|
||||||
|
std::cout << wordnet_dir << std::endl;
|
||||||
|
}
|
||||||
|
|
||||||
|
info = preprocess_wordnet(wordnet_dir);
|
||||||
|
|
||||||
|
wordnet_graph = graph(info.nb_synsets());
|
||||||
|
load_wordnet(wordnet_dir, *this, info);
|
||||||
|
|
||||||
|
if (_verbose)
|
||||||
|
{
|
||||||
|
std::cout << "nb_synsets: " << info.nb_synsets() << std::endl;
|
||||||
|
}
|
||||||
|
//FIXME: this check is only valid for Wordnet 3.0
|
||||||
|
//assert(info.nb_synsets() == 142335);//117659);
|
||||||
|
assert(info.nb_synsets() > 0);
|
||||||
|
}
|
||||||
|
|
||||||
|
std::vector<synset>
|
||||||
|
wordnet::get_synsets(const std::string& word, pos_t pos)
|
||||||
|
{
|
||||||
|
std::vector<synset> synsets;
|
||||||
|
|
||||||
|
// morphing
|
||||||
|
std::string mword = morphword(word, pos);
|
||||||
|
if (mword == "")
|
||||||
|
return synsets;
|
||||||
|
|
||||||
|
// binary_search
|
||||||
|
typedef std::vector<index> vi;
|
||||||
|
std::pair<vi::iterator,vi::iterator> bounds = get_indexes(mword);
|
||||||
|
|
||||||
|
vi::iterator it;
|
||||||
|
for (it = bounds.first; it != bounds.second; it++)
|
||||||
|
{
|
||||||
|
if (pos == pos_t::UNKNOWN || it->pos == pos)
|
||||||
|
{
|
||||||
|
for (std::size_t i = 0; i < it->synset_ids.size(); i++)
|
||||||
|
{
|
||||||
|
int id = it->synset_ids[i];
|
||||||
|
synsets.push_back(wordnet_graph[id]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return synsets;
|
||||||
|
}
|
||||||
|
|
||||||
|
const std::vector<std::string> *
|
||||||
|
wordnet::get_synset(const std::string& word, pos_t pos) const {
|
||||||
|
|
||||||
|
typedef std::vector<index> vi;
|
||||||
|
std::pair<vi::const_iterator,vi::const_iterator> bounds = get_indexes_const(word);
|
||||||
|
|
||||||
|
for (vi::const_iterator it = bounds.first; it != bounds.second; it++)
|
||||||
|
{
|
||||||
|
if (pos == pos_t::UNKNOWN || it->pos == pos)
|
||||||
|
{
|
||||||
|
int id = it->synset_ids[0];
|
||||||
|
return &wordnet_graph[id].words;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return nullptr;
|
||||||
|
}
|
||||||
|
|
||||||
|
std::pair<std::vector<index>::const_iterator, std::vector<index>::const_iterator>
|
||||||
|
wordnet::get_indexes_const(const std::string& word) const
|
||||||
|
{
|
||||||
|
index light_index;
|
||||||
|
light_index.lemma = word;
|
||||||
|
|
||||||
|
typedef std::vector<index> vi;
|
||||||
|
std::pair<vi::const_iterator,vi::const_iterator> bounds =
|
||||||
|
std::equal_range(index_list.begin(), index_list.end(), light_index);
|
||||||
|
|
||||||
|
return bounds;
|
||||||
|
}
|
||||||
|
|
||||||
|
std::pair<std::vector<index>::iterator, std::vector<index>::iterator>
|
||||||
|
wordnet::get_indexes(const std::string& word)
|
||||||
|
{
|
||||||
|
index light_index;
|
||||||
|
light_index.lemma = word;
|
||||||
|
|
||||||
|
typedef std::vector<index> vi;
|
||||||
|
std::pair<vi::iterator,vi::iterator> bounds =
|
||||||
|
std::equal_range(index_list.begin(), index_list.end(), light_index);
|
||||||
|
|
||||||
|
return bounds;
|
||||||
|
}
|
||||||
|
|
||||||
|
std::string
|
||||||
|
wordnet::wordbase(const std::string& word, int ender)
|
||||||
|
{
|
||||||
|
if (ext::ends_with(word, info.sufx[ender]))
|
||||||
|
{
|
||||||
|
int sufxlen = info.sufx[ender].size();
|
||||||
|
std::string strOut = word.substr(0, word.size() - sufxlen);
|
||||||
|
if (!info.addr[ender].empty())
|
||||||
|
strOut += info.addr[ender];
|
||||||
|
return strOut;
|
||||||
|
}
|
||||||
|
return word;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool is_defined(const std::string& word, pos_t pos)
|
||||||
|
{
|
||||||
|
// hack FIXME: Some verbs are built with -e suffix ('builde' is just an example).
|
||||||
|
if (pos == V && word == "builde")
|
||||||
|
return false;
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Try to find baseform (lemma) of individual word in POS
|
||||||
|
std::string
|
||||||
|
wordnet::morphword(const std::string& word, pos_t pos)
|
||||||
|
{
|
||||||
|
// first look for word on exception list
|
||||||
|
exc_t::iterator it = exc[pos].find(word);
|
||||||
|
if (it != exc[pos].end())
|
||||||
|
return it->second; // found in exception list
|
||||||
|
|
||||||
|
std::string tmpbuf;
|
||||||
|
std::string end;
|
||||||
|
int cnt = 0;
|
||||||
|
|
||||||
|
if (pos == R)
|
||||||
|
return ""; // Only use exception list for adverbs
|
||||||
|
|
||||||
|
if (pos == N)
|
||||||
|
{
|
||||||
|
if (ext::ends_with(word, "ful"))
|
||||||
|
{
|
||||||
|
cnt = word.size() - 3;
|
||||||
|
tmpbuf = word.substr(0, cnt);
|
||||||
|
end = "ful";
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
// check for noun ending with 'ss' or short words
|
||||||
|
if (ext::ends_with(word, "ss") || word.size() <= 2)
|
||||||
|
return "";
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// If not in exception list, try applying rules from tables
|
||||||
|
|
||||||
|
if (tmpbuf.size() == 0)
|
||||||
|
tmpbuf = word;
|
||||||
|
|
||||||
|
if (pos != pos_t::UNKNOWN)
|
||||||
|
{
|
||||||
|
int offset = info.offsets[pos];
|
||||||
|
int pos_cnt = info.cnts[pos];
|
||||||
|
|
||||||
|
std::string morphed;
|
||||||
|
for (int i = 0; i < pos_cnt; i++)
|
||||||
|
{
|
||||||
|
morphed = wordbase(tmpbuf, (i + offset));
|
||||||
|
if (morphed != tmpbuf && is_defined(morphed, pos))
|
||||||
|
return morphed + end;
|
||||||
|
}
|
||||||
|
return morphed;
|
||||||
|
}
|
||||||
|
return word;
|
||||||
|
}
|
||||||
|
|
||||||
|
} // end of namespace wnb
|
113
contrib/wordnet-blast/wnb/core/wordnet.hh
Normal file
113
contrib/wordnet-blast/wnb/core/wordnet.hh
Normal file
@ -0,0 +1,113 @@
|
|||||||
|
#pragma once
|
||||||
|
|
||||||
|
# include <iostream>
|
||||||
|
# include <string>
|
||||||
|
# include <cassert>
|
||||||
|
# include <vector>
|
||||||
|
//# include <boost/filesystem.hpp>
|
||||||
|
|
||||||
|
//Possible https://bugs.launchpad.net/ubuntu/+source/boost/+bug/270873
|
||||||
|
# include <boost/graph/graph_traits.hpp>
|
||||||
|
# include <boost/graph/adjacency_list.hpp>
|
||||||
|
|
||||||
|
# include "load_wordnet.hh"
|
||||||
|
# include "pos_t.hh"
|
||||||
|
|
||||||
|
namespace wnb
|
||||||
|
{
|
||||||
|
|
||||||
|
/// More info here: http://wordnet.princeton.edu/wordnet/man/wndb.5WN.html
|
||||||
|
|
||||||
|
struct info_helper;
|
||||||
|
|
||||||
|
/// Synset
|
||||||
|
struct synset
|
||||||
|
{
|
||||||
|
int lex_filenum;
|
||||||
|
std::size_t w_cnt;
|
||||||
|
std::vector<std::string> words;
|
||||||
|
std::vector<int> lex_ids;
|
||||||
|
std::size_t p_cnt;
|
||||||
|
std::string gloss;
|
||||||
|
|
||||||
|
// extra
|
||||||
|
pos_t pos; ///< pos (replace ss_type)
|
||||||
|
int id; ///< unique identifier (replace synset_offset)
|
||||||
|
int sense_number; ///< http://wordnet.princeton.edu/man/senseidx.5WN.html
|
||||||
|
std::vector<std::pair<std::string, int> > tag_cnts; ///< http://wordnet.princeton.edu/man/senseidx.5WN.html
|
||||||
|
|
||||||
|
bool operator==(const synset& s) const { return (id == s.id); }
|
||||||
|
bool operator<(const synset& s) const { return (id < s.id); }
|
||||||
|
};
|
||||||
|
|
||||||
|
|
||||||
|
/// Properties of a relation between two synsets (edge of the graph)
struct ptr
{
  int pointer_symbol; ///< relation code, as returned by info_helper::get_symbol
  int source;         ///< source word inside synset
  int target;         ///< target word inside synset
};
|
||||||
|
|
||||||
|
|
||||||
|
/// Index
|
||||||
|
struct index
|
||||||
|
{
|
||||||
|
std::string lemma;
|
||||||
|
|
||||||
|
std::size_t synset_cnt;
|
||||||
|
std::size_t p_cnt;
|
||||||
|
std::size_t sense_cnt;
|
||||||
|
float tagsense_cnt;
|
||||||
|
std::vector<std::string> ptr_symbols;
|
||||||
|
std::vector<int> synset_offsets;
|
||||||
|
|
||||||
|
// extra
|
||||||
|
std::vector<int> synset_ids;
|
||||||
|
pos_t pos;
|
||||||
|
|
||||||
|
bool operator<(const index& b) const
|
||||||
|
{
|
||||||
|
return (lemma.compare(b.lemma) < 0);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
|
||||||
|
/// Wordnet interface class
|
||||||
|
struct wordnet
|
||||||
|
{
|
||||||
|
typedef boost::adjacency_list<boost::vecS, boost::vecS,
|
||||||
|
boost::directedS,
|
||||||
|
synset, ptr> graph; ///< boost graph type
|
||||||
|
|
||||||
|
/// Constructor
|
||||||
|
wordnet(const std::string& wordnet_dir, bool verbose=false);
|
||||||
|
|
||||||
|
/// Return synsets matching word
|
||||||
|
std::vector<synset> get_synsets(const std::string& word, pos_t pos = pos_t::UNKNOWN);
|
||||||
|
//FIXME: todo
|
||||||
|
std::vector<synset> get_synset(const std::string& word, char pos, int i);
|
||||||
|
// added
|
||||||
|
const std::vector<std::string> * get_synset(const std::string& word, pos_t pos = pos_t::UNKNOWN) const;
|
||||||
|
|
||||||
|
std::pair<std::vector<index>::iterator, std::vector<index>::iterator>
|
||||||
|
get_indexes(const std::string& word);
|
||||||
|
|
||||||
|
std::pair<std::vector<index>::const_iterator, std::vector<index>::const_iterator>
|
||||||
|
get_indexes_const(const std::string& word) const;
|
||||||
|
|
||||||
|
std::string wordbase(const std::string& word, int ender);
|
||||||
|
|
||||||
|
std::string morphword(const std::string& word, pos_t pos);
|
||||||
|
|
||||||
|
std::vector<index> index_list; ///< index list // FIXME: use a map
|
||||||
|
graph wordnet_graph; ///< synsets graph
|
||||||
|
info_helper info; ///< helper object
|
||||||
|
bool _verbose;
|
||||||
|
|
||||||
|
typedef std::map<std::string,std::string> exc_t;
|
||||||
|
std::map<pos_t, exc_t> exc;
|
||||||
|
};
|
||||||
|
|
||||||
|
} // end of namespace wnb
|
180
contrib/wordnet-blast/wnb/main.cc
Normal file
180
contrib/wordnet-blast/wnb/main.cc
Normal file
@ -0,0 +1,180 @@
|
|||||||
|
|
||||||
|
#include <iostream>
|
||||||
|
#include <fstream>
|
||||||
|
#include <sstream>
|
||||||
|
|
||||||
|
#include <boost/progress.hpp>
|
||||||
|
#include <boost/algorithm/string.hpp>
|
||||||
|
|
||||||
|
#include <wnb/core/wordnet.hh>
|
||||||
|
#include <wnb/core/load_wordnet.hh>
|
||||||
|
#include <wnb/core/info_helper.hh>
|
||||||
|
#include <wnb/nltk_similarity.hh>
|
||||||
|
#include <wnb/std_ext.hh>
|
||||||
|
|
||||||
|
using namespace wnb;
|
||||||
|
using namespace boost;
|
||||||
|
using namespace boost::algorithm;
|
||||||
|
|
||||||
|
/// Validate the command line.
///
/// Expected form: <program> <wordnet_dir/> <word_list_file>, where the
/// directory argument must end with '/'.
///
/// @return true when the arguments are invalid (a usage line has been printed
///         and the caller should exit), false when they are acceptable.
bool usage(int argc, char ** argv)
{
  std::string dir;
  if (argc >= 2)
    dir = std::string(argv[1]);

  // Test for emptiness before looking at the last character: with an empty
  // string, dir.length() - 1 wraps around and indexes out of range.
  const bool dir_ok = !dir.empty() && dir[dir.length() - 1] == '/';

  if (argc != 3 || !dir_ok)
  {
    std::cout << argv[0] << " .../wordnet_dir/ word_list_file" << std::endl;
    return true;
  }
  return false;
}
|
||||||
|
|
||||||
|
/// A word together with its similarity score.
struct ws
{
  std::string w; ///< the word
  float s;       ///< its score

  /// Orders by descending score, so an ascending sort lists the highest
  /// score first.
  bool operator<(const ws& a) const
  {
    return a.s < s;
  }
};
|
||||||
|
|
||||||
|
|
||||||
|
/// Compute similarity of word with words in word list
|
||||||
|
std::vector<ws>
|
||||||
|
compute_similarities(wordnet& wn,
|
||||||
|
const std::string& word,
|
||||||
|
const std::vector<std::string>& word_list)
|
||||||
|
{
|
||||||
|
std::vector<ws> wslist;
|
||||||
|
std::vector<synset> synsets1 = wn.get_synsets(word);
|
||||||
|
|
||||||
|
for (unsigned i = 0; i < synsets1.size(); i++)
|
||||||
|
for (unsigned k = 0; k < synsets1[i].words.size(); k++)
|
||||||
|
std::cout << " - " << synsets1[i].words[k] << std::endl;
|
||||||
|
|
||||||
|
nltk_similarity path_similarity(wn);
|
||||||
|
{
|
||||||
|
progress_timer t;
|
||||||
|
progress_display show_progress(word_list.size());
|
||||||
|
|
||||||
|
for (unsigned k = 0; k < word_list.size(); k++)
|
||||||
|
{
|
||||||
|
const std::string& w = word_list[k];
|
||||||
|
float max = 0;
|
||||||
|
std::vector<synset> synsets2 = wn.get_synsets(w);
|
||||||
|
for (unsigned i = 0; i < synsets1.size(); i++)
|
||||||
|
{
|
||||||
|
for (unsigned j = 0; j < synsets2.size(); j++)
|
||||||
|
{
|
||||||
|
float s = path_similarity(synsets1[i], synsets2[j], 6);
|
||||||
|
if (s > max)
|
||||||
|
max = s;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
ws e = {w, max};
|
||||||
|
wslist.push_back(e);
|
||||||
|
++show_progress;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return wslist;
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Print the (at most) ten entries of word_list most similar to word,
/// one "<word> <score>" pair per line, best score first.
void similarity_test(wordnet& wn,
                     const std::string& word,
                     std::vector<std::string>& word_list)
{
  std::vector<ws> ranked = compute_similarities(wn, word, word_list);

  // ws::operator< orders by descending score; stable_sort keeps input order
  // among equal scores.
  std::stable_sort(ranked.begin(), ranked.end());

  const std::size_t shown = std::min(ranked.size(), size_t(10));
  for (std::size_t rank = 0; rank < shown; ++rank)
  {
    const ws& entry = ranked[rank];
    std::cout << entry.w << " " << entry.s << std::endl;
  }
}
|
||||||
|
|
||||||
|
/// Print an overview of every sense recorded in idx for part of speech pos:
/// a header with the sense count, then one numbered line per synset with
/// its tag count for the lemma, its words and its gloss.
void print_synsets(pos_t pos, wnb::index& idx, wordnet& wn)
{
  const std::string& lemma = idx.lemma;
  const std::size_t sense_count = idx.synset_ids.size();

  std::cout << "\nOverview of " << get_name_from_pos(pos) << " " << lemma << "\n\n";
  std::cout << "The " << get_name_from_pos(pos) << " " << lemma << " has "
            << sense_count << ((sense_count == 1) ? " sense": " senses");

  if (idx.tagsense_cnt == 0)
    std::cout << " (no senses from tagged texts)";
  else
    std::cout << " (first " << idx.tagsense_cnt << " from tagged texts)";

  std::cout << "\n";
  std::cout << " \n";

  for (std::size_t n = 0; n < sense_count; ++n)
  {
    // Synset ids index straight into the graph's vertex bundles.
    const int id = idx.synset_ids[n];
    const synset& sense = wn.wordnet_graph[id];

    std::cout << n + 1 << ". ";

    // Show the tag count(s) recorded for this lemma, if any.
    for (const auto& tag : sense.tag_cnts)
    {
      if (tag.first == lemma)
        std::cout << "(" << tag.second << ") ";
    }

    // For pos == A, cut each word at its first '(' before printing.
    std::vector<std::string> shown_words;
    for (const auto& w : sense.words)
    {
      if (pos == A)
        shown_words.push_back(w.substr(0, w.find_first_of("(")));
      else
        shown_words.push_back(w);
    }

    const std::string joined = join(shown_words, ", ");
    std::cout << replace_all_copy(joined, "_", " ");
    std::cout << " -- (" << trim_copy(sense.gloss) << ")";
    std::cout << std::endl;
  }
}
|
||||||
|
|
||||||
|
/// Print the synset overview of word for the index entries whose part of
/// speech equals pos. Does nothing for an empty word.
void wn_like(wordnet& wn, const std::string& word, pos_t pos)
{
  if (word.empty())
    return;

  typedef std::vector<wnb::index> index_vector;
  const std::pair<index_vector::iterator, index_vector::iterator> range =
    wn.get_indexes(word);

  for (index_vector::iterator entry = range.first; entry != range.second; ++entry)
  {
    // Skip entries for another part of speech (and everything when pos is -1).
    if (pos == -1 || !(entry->pos == pos))
      continue;

    print_synsets(pos, *entry, wn);
  }
}
|
||||||
|
|
||||||
|
/// For every word of word_list and every part-of-speech value from 1 up to
/// POS_ARRAY_SIZE - 1, print the overview of the word itself and, when
/// morphword() yields a different form, of that form as well.
void batch_test(wordnet& wn, std::vector<std::string>& word_list)
{
  for (const std::string& word : word_list)
  {
    for (unsigned p = 1; p < POS_ARRAY_SIZE; ++p)
    {
      const pos_t pos = static_cast<pos_t>(p);

      wn_like(wn, word, pos);

      const std::string base_form = wn.morphword(word, pos);
      if (base_form != word)
        wn_like(wn, base_form, pos);
    }
  }
}
|
||||||
|
|
||||||
|
int main(int argc, char ** argv)
|
||||||
|
{
|
||||||
|
if (usage(argc, argv))
|
||||||
|
return 1;
|
||||||
|
|
||||||
|
// read command line
|
||||||
|
std::string wordnet_dir = argv[1];
|
||||||
|
std::string test_file = argv[2];
|
||||||
|
|
||||||
|
wordnet wn(wordnet_dir);
|
||||||
|
|
||||||
|
// read test file
|
||||||
|
std::string list = ext::read_file(test_file);
|
||||||
|
std::vector<std::string> wl = ext::split(list);
|
||||||
|
|
||||||
|
batch_test(wn, wl);
|
||||||
|
}
|
||||||
|
|
146
contrib/wordnet-blast/wnb/nltk_similarity.hh
Normal file
146
contrib/wordnet-blast/wnb/nltk_similarity.hh
Normal file
@ -0,0 +1,146 @@
|
|||||||
|
#ifndef _NLTK_SIMILARITY_HH
|
||||||
|
# define _NLTK_SIMILARITY_HH
|
||||||
|
|
||||||
|
# include <queue>
|
||||||
|
# include <boost/graph/filtered_graph.hpp>
|
||||||
|
# include <wnb/core/wordnet.hh>
|
||||||
|
|
||||||
|
namespace wnb
|
||||||
|
{
|
||||||
|
namespace internal
|
||||||
|
{
|
||||||
|
|
||||||
|
/// Edge predicate that keeps only hypernym relations.
///
/// PointerSymbolMap is a property map that yields the pointer symbol of an
/// edge through get(map, edge).
template <typename PointerSymbolMap>
struct hyper_edge
{
  hyper_edge() { }

  hyper_edge(PointerSymbolMap pointer_symbol)
    : m_pointer_symbol(pointer_symbol)
  { }

  /// True when the pointer symbol of e is 1, i.e. hypernym
  /// (instance hypernym is not used here).
  template <typename Edge>
  bool operator()(const Edge& e) const
  {
    const int symbol = get(m_pointer_symbol, e);
    if (symbol == 1)
      return true;
    return false;
  }

  PointerSymbolMap m_pointer_symbol;
};
|
||||||
|
|
||||||
|
} // end of namespace internal
|
||||||
|
|
||||||
|
|
||||||
|
class nltk_similarity
|
||||||
|
{
|
||||||
|
|
||||||
|
typedef boost::property_map<wordnet::graph,
|
||||||
|
int ptr::*>::type PointerSymbolMap;
|
||||||
|
typedef boost::filtered_graph<wordnet::graph,
|
||||||
|
internal::hyper_edge<PointerSymbolMap> > G;
|
||||||
|
typedef boost::graph_traits<G>::vertex_descriptor vertex;
|
||||||
|
|
||||||
|
internal::hyper_edge<PointerSymbolMap> filter;
|
||||||
|
G fg;
|
||||||
|
|
||||||
|
public:
|
||||||
|
|
||||||
|
nltk_similarity(wordnet& wn)
|
||||||
|
: filter(get(&ptr::pointer_symbol, wn.wordnet_graph)),
|
||||||
|
fg(wn.wordnet_graph, filter)
|
||||||
|
{ }
|
||||||
|
|
||||||
|
/// Get list of hypernyms of s along with distance to s
|
||||||
|
std::map<vertex, int> hypernym_map(vertex s);
|
||||||
|
|
||||||
|
/// Get shortest path between and synset1 and synset2.
|
||||||
|
int shortest_path_distance(const synset& synset1, const synset& synset2);
|
||||||
|
|
||||||
|
/// return disance
|
||||||
|
float operator()(const synset& synset1, const synset& synset2, int=0);
|
||||||
|
|
||||||
|
};
|
||||||
|
|
||||||
|
std::map<nltk_similarity::vertex, int>
|
||||||
|
nltk_similarity::hypernym_map(nltk_similarity::vertex s)
|
||||||
|
{
|
||||||
|
std::map<vertex, int> map;
|
||||||
|
|
||||||
|
// Python:
|
||||||
|
// for (hypernym in self[HYPERNYM])
|
||||||
|
// distances |= hypernym.hypernym_distances(distance+1);
|
||||||
|
|
||||||
|
boost::graph_traits<G>::out_edge_iterator e, e_end;
|
||||||
|
std::queue<vertex> q;
|
||||||
|
|
||||||
|
q.push(s);
|
||||||
|
map[s] = 0;
|
||||||
|
while (!q.empty())
|
||||||
|
{
|
||||||
|
vertex u = q.front(); q.pop();
|
||||||
|
|
||||||
|
int new_d = map[u] + 1;
|
||||||
|
for (boost::tuples::tie(e, e_end) = out_edges(u, fg); e != e_end; ++e)
|
||||||
|
{
|
||||||
|
vertex v = target(*e,fg);
|
||||||
|
q.push(v);
|
||||||
|
|
||||||
|
if (map.find(v) != map.end())
|
||||||
|
{
|
||||||
|
if (new_d < map[v])
|
||||||
|
map[v] = new_d;
|
||||||
|
else
|
||||||
|
q.pop();
|
||||||
|
}
|
||||||
|
else
|
||||||
|
map[v] = new_d;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return map;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
int
|
||||||
|
nltk_similarity::shortest_path_distance(const synset& synset1, const synset& synset2)
|
||||||
|
{
|
||||||
|
vertex v1 = synset1.id;
|
||||||
|
vertex v2 = synset2.id;
|
||||||
|
|
||||||
|
std::map<vertex, int> map1 = hypernym_map(v1);
|
||||||
|
std::map<vertex, int> map2 = hypernym_map(v2);
|
||||||
|
|
||||||
|
// For each ancestor synset common to both subject synsets, find the
|
||||||
|
// connecting path length. Return the shortest of these.
|
||||||
|
|
||||||
|
int path_distance = -1;
|
||||||
|
std::map<vertex, int>::iterator it, it2;
|
||||||
|
for (it = map1.begin(); it != map1.end(); it++)
|
||||||
|
for (it2 = map2.begin(); it2 != map2.end(); it2++)
|
||||||
|
if (fg[it->first] == fg[it2->first])
|
||||||
|
{
|
||||||
|
int new_distance = it->second + it2->second;
|
||||||
|
if (path_distance < 0 || new_distance < path_distance)
|
||||||
|
path_distance = new_distance;
|
||||||
|
}
|
||||||
|
|
||||||
|
return path_distance;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
float
|
||||||
|
nltk_similarity::operator()(const synset& synset1, const synset& synset2, int)
|
||||||
|
{
|
||||||
|
int distance = shortest_path_distance(synset1, synset2);
|
||||||
|
if (distance >= 0)
|
||||||
|
return 1. / (distance + 1);
|
||||||
|
else
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
} // end of namespace wnb
|
||||||
|
|
||||||
|
#endif /* _NLTK_SIMILARITY_HH */
|
||||||
|
|
90
contrib/wordnet-blast/wnb/std_ext.hh
Normal file
90
contrib/wordnet-blast/wnb/std_ext.hh
Normal file
@ -0,0 +1,90 @@
|
|||||||
|
#ifndef _STD_EXT_HH
|
||||||
|
# define _STD_EXT_HH
|
||||||
|
|
||||||
|
# include <algorithm>
# include <fstream>
# include <iterator>
# include <sstream>
# include <stdexcept>
# include <string>
# include <vector>
|
||||||
|
|
||||||
|
namespace ext
|
||||||
|
{
|
||||||
|
/// Read a file, return the content as a C++ string
///
/// The file is opened in binary mode.
/// Throws std::runtime_error("File not found: <fn>") when it cannot be opened.
inline
std::string read_file(const std::string& fn)
{
  std::ifstream is(fn.c_str(), std::ios::binary);
  if (!is.is_open())
    throw std::runtime_error("File not found: " + fn);

  // Slurp everything from the current position up to end-of-stream.
  return std::string(std::istreambuf_iterator<char>(is),
                     std::istreambuf_iterator<char>());
}
|
||||||
|
|
||||||
|
/// Split a std::string on whitespace
///
/// Tokens are extracted with operator>>, so runs of whitespace never
/// produce empty tokens.
inline
std::vector<std::string> split(const std::string& str)
{
  std::vector<std::string> tokens;
  std::istringstream iss(str);

  for (std::string token; iss >> token; )
    tokens.push_back(token);

  return tokens;
}
|
||||||
|
|
||||||
|
/// Split a std::string on separator
///
/// Empty fields are kept, so the result always has exactly one more element
/// than there are separators in s (a single empty string for empty input).
inline
std::vector<std::string> split(const std::string& s, char seperator)
{
  std::vector<std::string> output;
  std::string::size_type field_start = 0;

  for (std::string::size_type hit = s.find(seperator, field_start);
       hit != std::string::npos;
       hit = s.find(seperator, field_start))
  {
    output.push_back(s.substr(field_start, hit - field_start));
    field_start = hit + 1;
  }

  // Whatever follows the last separator (possibly nothing).
  output.push_back(s.substr(field_start));
  return output;
}
|
||||||
|
|
||||||
|
/// Tell whether str finishes with ending (always true for an empty ending).
inline
bool
ends_with(const std::string& str, const std::string& ending)
{
  const std::string::size_type tail = ending.length();

  // A string shorter than the suffix cannot end with it.
  if (str.length() < tail)
    return false;

  return str.compare(str.length() - tail, tail, ending) == 0;
}
|
||||||
|
|
||||||
|
|
||||||
|
/// Sorted unique
///
/// Sorts v in place, then returns a new container holding each distinct
/// value once, in ascending order. v itself stays sorted, but std::unique
/// has rearranged it, so its tail past the unique prefix is unspecified.
template <typename T>
inline
T s_unique(T& v)
{
  std::sort(v.begin(), v.end());
  typename T::iterator unique_end = std::unique(v.begin(), v.end());

  return T(v.begin(), unique_end);
}
|
||||||
|
|
||||||
|
} // end of ext
|
||||||
|
|
||||||
|
#endif /* _STD_EXT_HH */
|
||||||
|
|
@ -124,3 +124,4 @@ endif()
|
|||||||
set_source_files_properties("pointInPolygon.cpp" PROPERTIES COMPILE_FLAGS -fno-sanitize=signed-integer-overflow)
|
set_source_files_properties("pointInPolygon.cpp" PROPERTIES COMPILE_FLAGS -fno-sanitize=signed-integer-overflow)
|
||||||
|
|
||||||
target_link_libraries(clickhouse_functions PRIVATE stemmer)
|
target_link_libraries(clickhouse_functions PRIVATE stemmer)
|
||||||
|
target_link_libraries(clickhouse_functions PRIVATE wnb)
|
@ -1,10 +1,11 @@
|
|||||||
#include <Common/Exception.h>
|
#include <Common/Exception.h>
|
||||||
#include <Interpreters/SynonymsExtensions.h>
|
#include <Functions/SynonymsExtensions.h>
|
||||||
|
|
||||||
#include <fstream>
|
#include <fstream>
|
||||||
#include <list>
|
#include <list>
|
||||||
|
|
||||||
#include <boost/algorithm/string.hpp>
|
#include <boost/algorithm/string.hpp>
|
||||||
|
#include <wnb/core/wordnet.hh>
|
||||||
|
|
||||||
namespace DB
|
namespace DB
|
||||||
{
|
{
|
||||||
@ -48,7 +49,7 @@ public:
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
Synset * getSynonyms(const std::string_view & token) const override
|
const Synset * getSynonyms(const std::string_view & token) const override
|
||||||
{
|
{
|
||||||
auto it = table.find(token);
|
auto it = table.find(token);
|
||||||
|
|
||||||
@ -62,20 +63,23 @@ public:
|
|||||||
class WordnetSynonymsExtension : public ISynonymsExtension
|
class WordnetSynonymsExtension : public ISynonymsExtension
|
||||||
{
|
{
|
||||||
private:
|
private:
|
||||||
// std::vector<std::vector<String>> data;
|
wnb::wordnet wn;
|
||||||
|
|
||||||
public:
|
public:
|
||||||
WordnetSynonymsExtension(const String & /*path*/)
|
WordnetSynonymsExtension(const String & path) : wn(path) {}
|
||||||
{
|
|
||||||
|
|
||||||
}
|
const Synset * getSynonyms(const std::string_view & token) const override
|
||||||
|
|
||||||
Synset * getSynonyms(const std::string_view & /*token*/) const override
|
|
||||||
{
|
{
|
||||||
return nullptr;
|
return wn.get_synset(std::string(token));
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
/// Duplicate of code from StringUtils.h. Copied here for less dependencies.
/// Returns true when s begins with the NUL-terminated string prefix.
static bool startsWith(const std::string & s, const char * prefix)
{
    const size_t prefix_size = strlen(prefix);

    if (s.size() < prefix_size)
        return false;

    return 0 == memcmp(s.data(), prefix, prefix_size);
}
|
||||||
|
|
||||||
SynonymsExtensions::SynonymsExtensions(const Poco::Util::AbstractConfiguration & config)
|
SynonymsExtensions::SynonymsExtensions(const Poco::Util::AbstractConfiguration & config)
|
||||||
{
|
{
|
||||||
String prefix = "synonyms_extensions";
|
String prefix = "synonyms_extensions";
|
||||||
@ -89,7 +93,7 @@ SynonymsExtensions::SynonymsExtensions(const Poco::Util::AbstractConfiguration &
|
|||||||
|
|
||||||
for (const auto & key : keys)
|
for (const auto & key : keys)
|
||||||
{
|
{
|
||||||
if (key == "extension")
|
if (startsWith(key, "extension"))
|
||||||
{
|
{
|
||||||
const auto & ext_name = config.getString(prefix + "." + key + ".name", "");
|
const auto & ext_name = config.getString(prefix + "." + key + ".name", "");
|
||||||
const auto & ext_path = config.getString(prefix + "." + key + ".path", "");
|
const auto & ext_path = config.getString(prefix + "." + key + ".path", "");
|
@ -19,7 +19,7 @@ public:
|
|||||||
|
|
||||||
//ISynonymsExtension(const String & path);
|
//ISynonymsExtension(const String & path);
|
||||||
|
|
||||||
virtual Synset * getSynonyms(const std::string_view & token) const = 0;
|
virtual const Synset * getSynonyms(const std::string_view & token) const = 0;
|
||||||
|
|
||||||
virtual ~ISynonymsExtension() = default;
|
virtual ~ISynonymsExtension() = default;
|
||||||
};
|
};
|
@ -6,7 +6,7 @@
|
|||||||
#include <Functions/FunctionFactory.h>
|
#include <Functions/FunctionFactory.h>
|
||||||
#include <Functions/FunctionHelpers.h>
|
#include <Functions/FunctionHelpers.h>
|
||||||
#include <Functions/IFunction.h>
|
#include <Functions/IFunction.h>
|
||||||
#include <Interpreters/SynonymsExtensions.h>
|
#include <Functions/SynonymsExtensions.h>
|
||||||
#include <Interpreters/Context.h>
|
#include <Interpreters/Context.h>
|
||||||
|
|
||||||
#include <string_view>
|
#include <string_view>
|
||||||
|
@ -53,7 +53,6 @@
|
|||||||
#include <Interpreters/InterserverCredentials.h>
|
#include <Interpreters/InterserverCredentials.h>
|
||||||
#include <Interpreters/Cluster.h>
|
#include <Interpreters/Cluster.h>
|
||||||
#include <Interpreters/InterserverIOHandler.h>
|
#include <Interpreters/InterserverIOHandler.h>
|
||||||
#include <Interpreters/SynonymsExtensions.h>
|
|
||||||
#include <Interpreters/SystemLog.h>
|
#include <Interpreters/SystemLog.h>
|
||||||
#include <Interpreters/Context.h>
|
#include <Interpreters/Context.h>
|
||||||
#include <Interpreters/DDLWorker.h>
|
#include <Interpreters/DDLWorker.h>
|
||||||
@ -75,7 +74,7 @@
|
|||||||
#include <Interpreters/DatabaseCatalog.h>
|
#include <Interpreters/DatabaseCatalog.h>
|
||||||
#include <Storages/MergeTree/BackgroundJobsExecutor.h>
|
#include <Storages/MergeTree/BackgroundJobsExecutor.h>
|
||||||
#include <Storages/MergeTree/MergeTreeDataPartUUID.h>
|
#include <Storages/MergeTree/MergeTreeDataPartUUID.h>
|
||||||
|
#include <Functions/SynonymsExtensions.h>
|
||||||
|
|
||||||
namespace ProfileEvents
|
namespace ProfileEvents
|
||||||
{
|
{
|
||||||
|
Loading…
Reference in New Issue
Block a user