added WordNet synonyms extensions

Nikolay Degterinsky 2021-06-03 19:28:12 +00:00
parent 876f51ab95
commit ed12fb5604
27 changed files with 2544 additions and 16 deletions

View File

@ -331,3 +331,4 @@ endif()
add_subdirectory(fast_float)
add_subdirectory(libstemmer-c-cmake)
add_subdirectory(wordnet-blast-cmake)

View File

@ -13,11 +13,12 @@ if (NOT USE_INTERNAL_BOOST_LIBRARY)
regex
context
coroutine
graph
)
if(Boost_INCLUDE_DIR AND Boost_FILESYSTEM_LIBRARY AND Boost_FILESYSTEM_LIBRARY AND
Boost_PROGRAM_OPTIONS_LIBRARY AND Boost_REGEX_LIBRARY AND Boost_SYSTEM_LIBRARY AND Boost_CONTEXT_LIBRARY AND
Boost_COROUTINE_LIBRARY)
Boost_COROUTINE_LIBRARY AND Boost_GRAPH_LIBRARY)
set(EXTERNAL_BOOST_FOUND 1)
@ -32,6 +33,7 @@ if (NOT USE_INTERNAL_BOOST_LIBRARY)
add_library (_boost_system INTERFACE)
add_library (_boost_context INTERFACE)
add_library (_boost_coroutine INTERFACE)
add_library (_boost_graph INTERFACE)
target_link_libraries (_boost_filesystem INTERFACE ${Boost_FILESYSTEM_LIBRARY})
target_link_libraries (_boost_iostreams INTERFACE ${Boost_IOSTREAMS_LIBRARY})
@ -40,6 +42,7 @@ if (NOT USE_INTERNAL_BOOST_LIBRARY)
target_link_libraries (_boost_system INTERFACE ${Boost_SYSTEM_LIBRARY})
target_link_libraries (_boost_context INTERFACE ${Boost_CONTEXT_LIBRARY})
target_link_libraries (_boost_coroutine INTERFACE ${Boost_COROUTINE_LIBRARY})
target_link_libraries (_boost_graph INTERFACE ${Boost_GRAPH_LIBRARY})
add_library (boost::filesystem ALIAS _boost_filesystem)
add_library (boost::iostreams ALIAS _boost_iostreams)
@ -48,6 +51,7 @@ if (NOT USE_INTERNAL_BOOST_LIBRARY)
add_library (boost::system ALIAS _boost_system)
add_library (boost::context ALIAS _boost_context)
add_library (boost::coroutine ALIAS _boost_coroutine)
add_library (boost::graph ALIAS _boost_graph)
else()
set(EXTERNAL_BOOST_FOUND 0)
message (${RECONFIGURE_MESSAGE_LEVEL} "Can't find system boost")
@ -221,4 +225,16 @@ if (NOT EXTERNAL_BOOST_FOUND)
add_library (boost::coroutine ALIAS _boost_coroutine)
target_include_directories (_boost_coroutine PRIVATE ${LIBRARY_DIR})
target_link_libraries(_boost_coroutine PRIVATE _boost_context)
# graph
set (SRCS_GRAPH
"${LIBRARY_DIR}/libs/graph/src/graphml.cpp"
"${LIBRARY_DIR}/libs/graph/src/read_graphviz_new.cpp"
)
add_library (_boost_graph ${SRCS_GRAPH})
add_library (boost::graph ALIAS _boost_graph)
target_include_directories (_boost_graph PRIVATE ${LIBRARY_DIR})
endif ()

View File

@ -0,0 +1,13 @@
set(LIBRARY_DIR "${ClickHouse_SOURCE_DIR}/contrib/wordnet-blast")
set(SRCS
"${LIBRARY_DIR}/wnb/core/info_helper.cc"
"${LIBRARY_DIR}/wnb/core/load_wordnet.cc"
"${LIBRARY_DIR}/wnb/core/wordnet.cc"
)
add_library(wnb ${SRCS})
target_link_libraries(wnb PRIVATE boost::headers_only boost::graph)
target_include_directories(wnb PUBLIC "${LIBRARY_DIR}")

View File

@ -0,0 +1 @@
Ugo Jardonnet ugo.jardonnet/gmail

View File

@ -0,0 +1,65 @@
CMAKE_MINIMUM_REQUIRED(VERSION 2.6)
PROJECT(wnb)
# Boost dependency
#--------------------------------------------------
# IF (${CMAKE_SYSTEM_NAME} MATCHES "Darwin")
# SET (BOOST_ROOT /Developer/boost_build/) # Suggested path
# ELSE()
# SET (BOOST_ROOT "/usr/include")
# ENDIF()
##############
SET (BOOST_ROOT "${ClickHouse_SOURCE_DIR}/contrib/boost")
##############
MESSAGE(STATUS "** Search Boost root: ${BOOST_ROOT}")
FIND_PACKAGE(Boost 1.70.0 COMPONENTS graph REQUIRED)
MESSAGE(STATUS "** Boost Include: ${Boost_INCLUDE_DIR}")
MESSAGE(STATUS "** Boost Libraries: ${Boost_LIBRARY_DIRS}")
MESSAGE(STATUS "** Boost Libraries: ${Boost_LIBRARIES}")
INCLUDE_DIRECTORIES(${Boost_INCLUDE_DIR})
LINK_DIRECTORIES(${Boost_LIBRARY_DIRS})
# Project
#--------------------------------------------------
LINK_DIRECTORIES(${wnb_SOURCE_DIR}/lib)
INCLUDE_DIRECTORIES(${PROJECT_BINARY_DIR})
SET(PROJECT_VERSION "0.6")
SET(ARCHIVE_NAME ${CMAKE_PROJECT_NAME}-${PROJECT_VERSION})
ADD_CUSTOM_TARGET(dist
COMMAND git archive --prefix=${ARCHIVE_NAME}/ HEAD
| bzip2 > ${CMAKE_BINARY_DIR}/${ARCHIVE_NAME}.tar.bz2
WORKING_DIRECTORY ${CMAKE_SOURCE_DIR})
ADD_CUSTOM_TARGET(check
COMMAND ./check/check.sh ./check/list.txt)
## Compiler flags
IF (CMAKE_COMPILER_IS_GNUCXX)
list(APPEND CMAKE_CXX_FLAGS " --std=c++11 -O3 -DNDEBUG -Wall -Wextra")
#list(APPEND CMAKE_CXX_FLAGS " -g -Wall -Wextra")
ENDIF()
SET(WNB_SRCS wnb/core/wordnet.cc
wnb/core/load_wordnet.cc wnb/core/info_helper.cc)
# Executable
#--------------------------------------------------
ADD_EXECUTABLE (wntest wnb/main.cc ${WNB_SRCS})
SET(EXECUTABLE_OUTPUT_PATH ${wnb_BINARY_DIR}/bin)
# Static library
#--------------------------------------------------
ADD_LIBRARY(wnb ${WNB_SRCS})
SET(LIBRARY_OUTPUT_PATH ${wnb_BINARY_DIR}/lib)
IF (Boost_FOUND)
TARGET_LINK_LIBRARIES(wntest ${Boost_LIBRARIES})
TARGET_LINK_LIBRARIES(wnb ${Boost_LIBRARIES})
ENDIF()

View File

@ -0,0 +1,43 @@
=====================================================================
WordNet Blast
=====================================================================
In-memory access to the WordNet ontology.
DEPENDENCIES:
boost 1.46
wordnet-sense-index
colordiff (for wntest)
INSTALL:
cmake CMakeLists.txt
make
TESTS: (Beta)
make check
USAGE:
#include "wordnet.hh"
#include "wnb/nltk_similarity.hh"
using namespace std;
using namespace wnb;
int main()
{
wordnet wn(PATH_TO_WORDNET);
vector<synset> synsets1 = wn.get_synsets("cat");
vector<synset> synsets2 = wn.get_synsets("dog");
nltk_similarity similarity(wn);
float d = similarity(synsets1[0], synsets2[0], 6);
}
BUGS:
- Word Morphing is sometimes incorrect.
REFERENCE:
George A. Miller (1995). WordNet: A Lexical Database for English.
Communications of the ACM Vol. 38, No. 11: 39-41.

View File

@ -0,0 +1,25 @@
This license is available as the file LICENSE in any downloaded version of
WordNet.
WordNet Release 3.0
This software and database is being provided to you, the LICENSEE, by Princeton
University under the following license. By obtaining, using and/or copying this
software and database, you agree that you have read, understood, and will comply
with these terms and conditions.: Permission to use, copy, modify and distribute
this software and database and its documentation for any purpose and without fee
or royalty is hereby granted, provided that you agree to comply with the
following copyright notice and statements, including the disclaimer, and that
the same appear on ALL copies of the software, database and documentation,
including modifications that you make for internal use or for distribution.
WordNet 3.0 Copyright 2006 by Princeton University. All rights reserved. THIS
SOFTWARE AND DATABASE IS PROVIDED "AS IS" AND PRINCETON UNIVERSITY MAKES NO
REPRESENTATIONS OR WARRANTIES, EXPRESS OR IMPLIED. BY WAY OF EXAMPLE, BUT NOT
LIMITATION, PRINCETON UNIVERSITY MAKES NO REPRESENTATIONS OR WARRANTIES OF
MERCHANT- ABILITY OR FITNESS FOR ANY PARTICULAR PURPOSE OR THAT THE USE OF THE
LICENSED SOFTWARE, DATABASE OR DOCUMENTATION WILL NOT INFRINGE ANY THIRD PARTY
PATENTS, COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS. The name of Princeton
University or Princeton may not be used in advertising or publicity pertaining
to distribution of the software and/or database. Title to copyright in this
software, database and any associated documentation shall at all times remain
with Princeton University and LICENSEE agrees to preserve same.

View File

@ -0,0 +1,11 @@
* 0.6
- Improve tests
- get_synsets by pos
- Load wordnet a bit faster
- Fix build on Mac OS (thanks to Roman Kutlak)
- Update doc
- Improve testing
* 0.5
- get_synsets
with morphing partially implemented (thanks to Yaron Feigin)
- sense similarity

View File

@ -0,0 +1,852 @@
a
able
about
account
acid
across
act
addition
adjustment
advertisement
after
again
against
agreement
air
all
almost
among
amount
amusement
and
angle
angry
animal
answer
ant
any
apparatus
apple
approval
arch
argument
arm
army
art
as
at
attack
attempt
attention
attraction
authority
automatic
awake
baby
back
bad
bag
balance
ball
band
base
basin
basket
bath
be
beautiful
because
bed
bee
before
behaviour
belief
bell
bent
berry
between
bird
birth
bit
bite
bitter
black
blade
blood
blow
blue
board
boat
body
boiling
bone
book
boot
bottle
box
boy
brain
brake
branch
brass
bread
breath
brick
bridge
bright
broken
brother
brown
brush
bucket
building
bulb
burn
burst
business
but
butter
button
by
cake
camera
canvas
card
care
carriage
cart
cat
cause
certain
chain
chalk
chance
change
cheap
cheese
chemical
chest
chief
chin
church
circle
clean
clear
clock
cloth
cloud
coal
coat
cold
collar
colour
comb
come
comfort
committee
common
company
comparison
competition
complete
complex
condition
connection
conscious
control
cook
copper
copy
cord
cork
cotton
cough
country
cover
cow
crack
credit
crime
cruel
crush
cry
cup
cup
current
curtain
curve
cushion
damage
danger
dark
daughter
day
dead
dear
death
debt
decision
deep
degree
delicate
dependent
design
desire
destruction
detail
development
different
digestion
direction
dirty
discovery
discussion
disease
disgust
distance
distribution
division
do
dog
door
doubt
down
drain
drawer
dress
drink
driving
drop
dry
dust
ear
early
earth
east
edge
education
effect
egg
elastic
electric
end
engine
enough
equal
error
even
event
ever
every
example
exchange
existence
expansion
experience
expert
eye
face
fact
fall
false
family
far
farm
fat
father
fear
feather
feeble
feeling
female
fertile
fiction
field
fight
finger
fire
first
fish
fixed
flag
flame
flat
flight
floor
flower
fly
fold
food
foolish
foot
for
force
fork
form
forward
fowl
frame
free
frequent
friend
from
front
fruit
full
future
garden
general
get
girl
give
glass
glove
go
goat
gold
good
government
grain
grass
great
green
grey
grip
group
growth
guide
gun
hair
hammer
hand
hanging
happy
harbour
hard
harmony
hat
hate
have
he
head
healthy
hear
hearing
heart
heat
help
high
history
hole
hollow
hook
hope
horn
horse
hospital
hour
house
how
humour
I
ice
idea
if
ill
important
impulse
in
increase
industry
ink
insect
instrument
insurance
interest
invention
iron
island
jelly
jewel
join
journey
judge
jump
keep
kettle
key
kick
kind
kiss
knee
knife
knot
knowledge
land
language
last
late
laugh
law
lead
leaf
learning
leather
left
leg
let
letter
level
library
lift
light
like
limit
line
linen
lip
liquid
list
little
living
lock
long
look
loose
loss
loud
love
low
machine
make
male
man
manager
map
mark
market
married
mass
match
material
may
meal
measure
meat
medical
meeting
memory
metal
middle
military
milk
mind
mine
minute
mist
mixed
money
monkey
month
moon
morning
mother
motion
mountain
mouth
move
much
muscle
music
nail
name
narrow
nation
natural
near
necessary
neck
need
needle
nerve
net
new
news
night
no
noise
normal
north
nose
not
note
now
number
nut
observation
of
off
offer
office
oil
old
on
only
open
operation
opinion
opposite
or
orange
order
organization
ornament
other
out
oven
over
owner
page
pain
paint
paper
parallel
parcel
part
past
paste
payment
peace
pen
pencil
person
physical
picture
pig
pin
pipe
place
plane
plant
plate
play
please
pleasure
plough
pocket
point
poison
polish
political
poor
porter
position
possible
pot
potato
powder
power
present
price
print
prison
private
probable
process
produce
profit
property
prose
protest
public
pull
pump
punishment
purpose
push
put
quality
question
quick
quiet
quite
rail
rain
range
rat
rate
ray
reaction
reading
ready
reason
receipt
record
red
regret
regular
relation
religion
representative
request
respect
responsible
rest
reward
rhythm
rice
right
ring
river
road
rod
roll
roof
room
root
rough
round
rub
rule
run
sad
safe
sail
salt
same
sand
say
scale
school
science
scissors
screw
sea
seat
second
secret
secretary
see
seed
seem
selection
self
send
sense
separate
serious
servant
sex
shade
shake
shame
sharp
sheep
shelf
ship
shirt
shock
shoe
short
shut
side
sign
silk
silver
simple
sister
size
skin
skirt
sky
sleep
slip
slope
slow
small
smash
smell
smile
smoke
smooth
snake
sneeze
snow
so
soap
society
sock
soft
solid
some
son
song
sort
sound
soup
south
space
spade
special
sponge
spoon
spring
square
stage
stamp
star
start
statement
station
steam
steel
stem
step
stick
sticky
stiff
still
stitch
stocking
stomach
stone
stop
store
story
straight
strange
street
stretch
strong
structure
substance
such
sudden
sugar
suggestion
summer
sun
support
surprise
sweet
swim
system
table
tail
take
talk
tall
taste
tax
teaching
tendency
test
than
that
the
then
theory
there
thick
thin
thing
this
thought
thread
throat
through
through
thumb
thunder
ticket
tight
till
time
tin
tired
to
toe
together
tomorrow
tongue
tooth
top
touch
town
trade
train
transport
tray
tree
trick
trouble
trousers
true
turn
twist
umbrella
under
unit
up
use
value
verse
very
vessel
view
violent
voice
waiting
walk
wall
war
warm
wash
waste
watch
water
wave
wax
way
weather
week
weight
well
west
wet
wheel
when
where
while
whip
whistle
white
who
why
wide
will
wind
window
wine
wing
winter
wire
wise
with
woman
wood
wool
word
work
worm
wound
writing
wrong
year
yellow
yes
yesterday
you
young

View File

@ -0,0 +1,16 @@
#!/bin/bash
WNHOME=/usr/share/wordnet/
check() {
local word_list="$1"
echo "./bin/wntest $WNHOME ${word_list}"
time ./bin/wntest $WNHOME ${word_list} > ${word_list}.blast
echo "for i in \`cat ${word_list}\`; do wn $i -over; done"
time for i in `cat ${word_list}`; do wn $i -over; done > ${word_list}.wn
echo "diff ${word_list}.wn ${word_list}.blast -b"
colordiff -y ${word_list}.wn ${word_list}.blast -b
}
check "$1"

View File

@ -0,0 +1,7 @@
cat
lions
city
building
salvation
medications
haven

View File

@ -0,0 +1,72 @@
#ifndef _BFS_HH
# define _BFS_HH
# include <boost/graph/breadth_first_search.hpp>
# include <boost/graph/filtered_graph.hpp>
namespace wnb
{
struct synset;
namespace bfs // breadth first search tools
{
/// bfs_visitor
/// Sum distances and throw answer if target synset found
template <typename DistanceMap>
class distance_recorder : public boost::default_bfs_visitor
{
public:
distance_recorder(DistanceMap dist, const synset& s, int max)
: d(dist), target(s), max_length(max)
{ }
template <typename Edge, typename Graph>
void tree_edge(Edge e, const Graph& g) const
{
typename boost::graph_traits<Graph>::vertex_descriptor
u = boost::source(e, g), v = boost::target(e, g);
d[v] = d[u] + 1;
if (g[v] == target)
throw d[v];
if (d[v] > max_length)
throw -1;
}
private:
DistanceMap d;
const synset& target;
int max_length;
};
/// Convenience function
template <typename DistanceMap>
distance_recorder<DistanceMap>
record_distance(DistanceMap d, const synset& s, int m)
{
return distance_recorder<DistanceMap>(d, s, m);
}
/// This predicate function object determines which edges of the original
/// graph will show up in the filtered graph.
//FIXME: Do we really need a map here? (check cost of property_map construction;
// it should be light)
template <typename PointerSymbolMap>
struct hypo_hyper_edge {
hypo_hyper_edge() { }
hypo_hyper_edge(PointerSymbolMap pointer_symbol)
: m_pointer_symbol(pointer_symbol) { }
template <typename Edge>
bool operator()(const Edge& e) const {
int p_s = get(m_pointer_symbol, e);
//see pointer symbol list in info_helper.hh
return p_s == 1 || p_s == 2 || p_s == 3 || p_s == 4;
}
PointerSymbolMap m_pointer_symbol;
};
} // end of wnb::bfs
} // end of namespace wnb
#endif /* _BFS_HH */
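The visitor and edge filter above are meant to be driven by boost::breadth_first_search over a filtered view of the wordnet graph: distances accumulate on tree edges, and the answer is thrown as an exception once the target synset is reached (or -1 once max_length is exceeded). Below is a minimal driver sketch; it is not part of this commit, the helper name hypo_hyper_distance and the "bfs.hh" include path are illustrative, and it assumes the wnb::wordnet, ptr and synset types from wnb/core/wordnet.hh.

// Sketch only (not part of this commit): drive distance_recorder with a BFS
// over the hypo/hypernym-filtered wordnet graph.
#include <vector>
#include <boost/graph/breadth_first_search.hpp>
#include <boost/graph/filtered_graph.hpp>
#include <wnb/core/wordnet.hh>
#include "bfs.hh" // illustrative include path for the header above
namespace wnb
{
  // Edge distance between two synsets, or -1 if unreachable or beyond max_length.
  inline int hypo_hyper_distance(wordnet& wn, const synset& from, const synset& to, int max_length)
  {
    typedef boost::property_map<wordnet::graph, int ptr::*>::type symbol_map_t;
    symbol_map_t symbols = get(&ptr::pointer_symbol, wn.wordnet_graph);
    // Keep only hypernym/hyponym edges (pointer symbols 1..4, see info_helper.hh).
    bfs::hypo_hyper_edge<symbol_map_t> filter(symbols);
    boost::filtered_graph<wordnet::graph, bfs::hypo_hyper_edge<symbol_map_t> > fg(wn.wordnet_graph, filter);
    std::vector<int> distances(boost::num_vertices(wn.wordnet_graph), 0);
    try
    {
      // The visitor reports its result by throwing an int instead of returning.
      boost::breadth_first_search(fg, boost::vertex(from.id, wn.wordnet_graph),
                                  boost::visitor(bfs::record_distance(&distances[0], to, max_length)));
    }
    catch (int d)
    {
      return d;
    }
    return -1; // target synset not reachable from the source
  }
} // end of namespace wnb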

View File

@ -0,0 +1,148 @@
#include "info_helper.hh"
#include <iostream>
#include <fstream>
#include <sstream>
#include <map>
#include <cassert>
namespace wnb
{
// Class info_helper
/// List of pointer symbols
const char *
info_helper::symbols[info_helper::NB_SYMBOLS] = {
"!" , // 0 Antonym
"@" , // 1 Hypernym
"@i", // 2 Instance Hypernym
"~" , // 3 Hyponym
"~i", // 4 Instance Hyponym
"#m", // 5 Member holonym
"#s", // 6 Substance holonym
"#p", // 7 Part holonym
"%m", // 8 Member meronym
"%s", // 9 Substance meronym
"%p", // 10 Part meronym
"=" , // 11 Attribute
"+" , // 12 Derivationally related form
";c", // 13 Domain of synset - TOPIC
"-c", // 14 Member of this domain - TOPIC
";r", // 15 Domain of synset - REGION
"-r", // 16 Member of this domain - REGION
";u", // 17 Domain of synset - USAGE
"-u", // 18 Member of this domain - USAGE
//The pointer symbols for verbs are:
"*", // 19 Entailment
">", // 20 Cause
"^", // 21 Also see
"$", // 22 Verb Group
//The pointer symbols for adjectives are:
"&", // 23 Similar to
"<", // 24 Participle of verb
"\\", // 25 Pertainym (pertains to noun)
"=", // 26 Attribute
};
const std::string info_helper::sufx[] = {
/* Noun suffixes */
"s", "ses", "xes", "zes", "ches", "shes", "men", "ies",
/* Verb suffixes */
"s", "ies", "es", "es", "ed", "ed", "ing", "ing",
/* Adjective suffixes */
"er", "est", "er", "est"
};
const std::string info_helper::addr[] = {
/* Noun endings */
"", "s", "x", "z", "ch", "sh", "man", "y",
/* Verb endings */
"", "y", "e", "", "e", "", "e", "",
/* Adjective endings */
"", "", "e", "e"
};
const int info_helper::offsets[info_helper::NUMPARTS] = { 0, 0, 8, 16, 0, 0 };
const int info_helper::cnts[info_helper::NUMPARTS] = { 0, 8, 8, 4, 0, 0 };
void
info_helper::update_pos_maps()
{
// http://wordnet.princeton.edu/wordnet/man/wndb.5WN.html#sect3
indice_offset[UNKNOWN] = 0;
indice_offset[N] = 0;
indice_offset[V] = indice_offset[N] + pos_maps[N].size();
indice_offset[A] = indice_offset[V] + pos_maps[V].size();
indice_offset[R] = indice_offset[A] + pos_maps[A].size();
indice_offset[S] = indice_offset[R] + pos_maps[R].size();
}
int info_helper::compute_indice(int offset, pos_t pos)
{
if (pos == S)
pos = A;
std::map<int,int>& map = pos_maps[pos];
assert(pos <= 5 && pos > 0);
return indice_offset[pos] + map[offset];
}
// Function definitions
// Return relation between synset indices and offsets
static
std::map<int,int>
preprocess_data(const std::string& fn)
{
std::map<int,int> map;
std::ifstream file(fn.c_str());
if (!file.is_open())
throw std::runtime_error("preprocess_data: File not found: " + fn);
std::string row;
//skip header
const unsigned int header_nb_lines = 29;
for(std::size_t i = 0; i < header_nb_lines; i++)
std::getline(file, row);
int ind = 0;
//parse data line
while (std::getline(file, row))
{
std::stringstream srow(row);
int offset;
srow >> offset;
map.insert(std::pair<int,int>(offset, ind));
ind++;
}
file.close();
return map;
}
info_helper
preprocess_wordnet(const std::string& dn)
{
info_helper info;
info.pos_maps[N] = preprocess_data((dn + "data.noun")); // noun_map
info.pos_maps[V] = preprocess_data((dn + "data.verb")); // verb_map
info.pos_maps[A] = preprocess_data((dn + "data.adj")); // adj_map
info.pos_maps[R] = preprocess_data((dn + "data.adv")); // adv_map
info.update_pos_maps();
return info;
}
} // end of namespace wnb

View File

@ -0,0 +1,85 @@
#pragma once
# include <string>
# include <stdexcept>
# include <map>
# include "pos_t.hh"
namespace wnb
{
/// Useful information for wordnet in-memory import
struct info_helper
{
/// Symbols' size
static const std::size_t NB_SYMBOLS = 27;
static const std::size_t NUMPARTS = POS_ARRAY_SIZE;
/// List of pointer symbols
static const char * symbols[NB_SYMBOLS];
static const std::string sufx[];
static const std::string addr[];
static const int offsets[NUMPARTS];
static const int cnts[NUMPARTS];
typedef std::map<int,int> i2of_t; ///< indice/offset correspondences
typedef std::map<pos_t, i2of_t> pos_i2of_t; ///< pos / map correspondences
/// Constructor
info_helper() { update_pos_maps(); }
/// Compute the number of synsets (i.e. the number of vertex in the graph)
unsigned nb_synsets()
{
typedef pos_i2of_t::iterator iter_t;
int sum = 0;
for (iter_t it = pos_maps.begin(); it != pos_maps.end(); it++)
sum += (*it).second.size();
return sum;
//return adj_map.size() + adv_map.size() + noun_map.size() + verb_map.size();
}
// Given a pos return the starting indice in the graph
int get_indice_offset(pos_t pos)
{
return indice_offset[pos];
}
/// Helper function computing global indice in graph from local offset
int compute_indice(int offset, pos_t pos);
/// Update a map allowing one to get the correct map given a pos
void update_pos_maps();
int get_symbol(const std::string& ps)
{
for (unsigned i = 0; i < NB_SYMBOLS; i++)
if (ps == symbols[i])
return i;
throw std::runtime_error("Symbol NOT FOUND.");
}
pos_t get_pos(const char& c)
{
return get_pos_from_char(c);
}
public:
// i2of_t adj_map;
// i2of_t adv_map;
// i2of_t noun_map;
// i2of_t verb_map;
pos_i2of_t pos_maps;
std::size_t indice_offset[POS_ARRAY_SIZE];
};
/// Create a new info_helper based on wordnet data located in dn (../dict/)
info_helper preprocess_wordnet(const std::string& dn);
} // end of namespace wnb

View File

@ -0,0 +1,381 @@
#include "load_wordnet.hh"
#include <fstream>
#include <iostream>
#include <sstream>
#include <stdexcept>
#include <algorithm>
#include <utility>
#include <boost/graph/adjacency_list.hpp>
#include <boost/progress.hpp>
#include <boost/algorithm/string.hpp>
#include <wnb/std_ext.hh>
#include "wordnet.hh"
#include "info_helper.hh"
#include "pos_t.hh"
namespace bg = boost::graph;
namespace wnb
{
namespace
{
// Load synset's words
void load_data_row_words(std::stringstream& srow, synset& synset)
{
srow >> std::hex >> synset.w_cnt >> std::dec;
for (std::size_t i = 0; i < synset.w_cnt; i++)
{
//word lex_id
std::string word;
srow >> word;
synset.words.push_back(word);
int lex_id;
srow >> std::hex >> lex_id >> std::dec;
synset.lex_ids.push_back(lex_id);
}
}
// Add rel to graph
void add_wordnet_rel(std::string& pointer_symbol_,// type of relation
int synset_offset, // dest offset
pos_t pos, // p.o.s. of dest
int src, // word src
int trgt, // word target
synset& synset, // source synset
wordnet& wn, // our wordnet
info_helper& info) // helper
{
//if (pos == S || synset.pos == S)
// return; //FIXME: check where are s synsets.
int u = synset.id;
int v = info.compute_indice(synset_offset, pos);
ptr p;
p.pointer_symbol = info.get_symbol(pointer_symbol_);
p.source = src;
p.target = trgt;
boost::add_edge(u,v, p, wn.wordnet_graph);
}
// load ptrs
void load_data_row_ptrs(std::stringstream& srow, synset& synset,
wordnet& wn, info_helper& info)
{
srow >> synset.p_cnt;
for (std::size_t i = 0; i < synset.p_cnt; i++)
{
//http://wordnet.princeton.edu/wordnet/man/wndb.5WN.html#sect3
//pointer_symbol synset_offset pos source/target
std::string pointer_symbol_;
int synset_offset;
pos_t pos;
int src;
int trgt;
srow >> pointer_symbol_;
srow >> synset_offset;
char c;
srow >> c;
pos = info.get_pos(c);
//print extracted edges
//std::cout << "(" << pointer_symbol << ", " << synset_offset;
//std::cout << ", " << pos << ")" << std::endl;
// Extract source/target words info
std::string src_trgt;
srow >> src_trgt;
std::stringstream ssrc(std::string(src_trgt,0,2));
std::stringstream strgt(std::string(src_trgt,2,2));
ssrc >> std::hex >> src >> std::dec;
strgt >> std::hex >> trgt >> std::dec;
add_wordnet_rel(pointer_symbol_, synset_offset, pos, src, trgt, synset, wn, info);
}
}
// Load a synset and add it to the wordnet class.
void load_data_row(const std::string& row, wordnet& wn, info_helper& info)
{
//http://wordnet.princeton.edu/wordnet/man/wndb.5WN.html#sect3
// synset_offset lex_filenum ss_type w_cnt word lex_id [word lex_id...] p_cnt [ptr...] [frames...] | gloss
synset synset;
std::stringstream srow(row);
int synset_offset;
srow >> synset_offset;
srow >> synset.lex_filenum;
char ss_type;
srow >> ss_type;
// extra information
synset.pos = info.get_pos(ss_type);
synset.id = info.compute_indice(synset_offset, synset.pos);
// words
load_data_row_words(srow, synset);
// ptrs
load_data_row_ptrs(srow, synset, wn, info);
//frames (skipped)
std::string tmp;
while (srow >> tmp)
if (tmp == "|")
break;
// gloss
std::getline(srow, synset.gloss);
// extra
synset.sense_number = 0;
// Add synset to graph
wn.wordnet_graph[synset.id] = synset;
}
// Parse data.noun files
void load_wordnet_data(const std::string& fn, wordnet& wn, info_helper& info)
{
std::ifstream fin(fn.c_str());
if (!fin.is_open())
throw std::runtime_error("File missing: " + fn);
static const int MAX_LENGTH = 20480;
char row[MAX_LENGTH];
//skip header
for(unsigned i = 0; i < 29; i++)
fin.getline(row, MAX_LENGTH);
//parse data line
while (fin.getline(row, MAX_LENGTH))
load_data_row(row, wn, info);
fin.close();
}
//FIXME: It seems possible to replace synset_offsets with indice here.
void load_index_row(const std::string& row, wordnet& wn, info_helper& info)
{
// lemma pos synset_cnt p_cnt [ptr_symbol...] sense_cnt tagsense_cnt synset_offset [synset_offset...]
index index;
std::stringstream srow(row);
char pos;
srow >> index.lemma;
srow >> pos;
index.pos = info.get_pos(pos); // extra data
srow >> index.synset_cnt;
srow >> index.p_cnt;
std::string tmp_p;
for (std::size_t i = 0; i < index.p_cnt; i++)
{
srow >> tmp_p;
index.ptr_symbols.push_back(tmp_p);
}
srow >> index.sense_cnt;
srow >> index.tagsense_cnt;
int tmp_o;
while (srow >> tmp_o)
{
index.synset_offsets.push_back(tmp_o);
index.synset_ids.push_back(info.compute_indice(tmp_o, index.pos)); // extra data
}
//add synset to index list
wn.index_list.push_back(index);
}
void load_wordnet_index(const std::string& fn, wordnet& wn, info_helper& info)
{
std::ifstream fin(fn.c_str());
if (!fin.is_open())
throw std::runtime_error("File Not Found: " + fn);
static const int MAX_LENGTH = 20480;
char row[MAX_LENGTH];
//skip header
const unsigned int header_nb_lines = 29;
for(std::size_t i = 0; i < header_nb_lines; i++)
fin.getline(row, MAX_LENGTH);
//parse data line
while (fin.getline(row, MAX_LENGTH))
load_index_row(row, wn, info);
fin.close();
}
void load_wordnet_exc(const std::string& dn, std::string cat,
wordnet& wn, info_helper&)
{
std::string fn = dn + cat + ".exc";
std::ifstream fin(fn.c_str());
if (!fin.is_open())
throw std::runtime_error("File Not Found: " + fn);
std::map<std::string,std::string>& exc = wn.exc[get_pos_from_name(cat)];
std::string row;
std::string key, value;
while (std::getline(fin, row))
{
std::stringstream srow(row);
srow >> key;
srow >> value;
exc[key] = value;
}
}
void load_wordnet_cat(const std::string dn, std::string cat,
wordnet& wn, info_helper& info)
{
load_wordnet_data((dn + "data." + cat), wn, info);
load_wordnet_index((dn + "index." + cat), wn, info);
load_wordnet_exc(dn, cat, wn, info);
}
// FIXME: this file is not in all packaged version of wordnet
void load_wordnet_index_sense(const std::string& dn, wordnet& wn, info_helper& info)
{
std::string fn = dn + "index.sense";
std::ifstream fin(fn.c_str());
if (!fin.is_open())
throw std::runtime_error("File Not Found: " + fn);
std::string row;
std::string sense_key;
int synset_offset;
while (std::getline(fin, row))
{
std::stringstream srow(row);
srow >> sense_key;
// Get the pos of the lemma
std::vector<std::string> sk = ext::split(sense_key,'%');
std::string word = sk.at(0);
std::stringstream tmp(ext::split(sk.at(1), ':').at(0));
int ss_type;
tmp >> ss_type;
pos_t pos = (pos_t) ss_type;
srow >> synset_offset;
// Update synset info
int u = info.compute_indice(synset_offset, pos);
int sense_number;
srow >> sense_number;
wn.wordnet_graph[u].sense_number += sense_number;
int tag_cnt;
srow >> tag_cnt;
if (tag_cnt != 0)
wn.wordnet_graph[u].tag_cnts.push_back( make_pair(word,tag_cnt) );
//if (synset_offset == 2121620)
// std::cout << u << " " << word << " " << synset_offset << " "
// << wn.wordnet_graph[u].tag_cnt << " "
// << wn.wordnet_graph[u].words[0] << std::endl;
}
}
// `wn -over` uses the info in cntlist even though it is deprecated.
// It is OK to leave this function unfixed and unused.
void load_wordnet_cntlist(const std::string& dn, wordnet& wn, info_helper& info)
{
std::string fn = dn + "cntlist";
std::ifstream fin(fn.c_str());
if (!fin.is_open())
throw std::runtime_error("File Not Found: " + fn);
std::string sense_key;
int sense_number;
int tag_cnt;
std::string row;
while (std::getline(fin, row))
{
std::stringstream srow(row);
srow >> sense_key;
srow >> sense_number;
srow >> tag_cnt;
// Get the pos of the lemma
std::string word = ext::split(sense_key,'%').at(0);
std::stringstream tmp(ext::split(ext::split(sense_key,'%').at(1), ':').at(0));
int ss_type;
tmp >> ss_type;
pos_t pos = (pos_t) ss_type;
// Update synset info
int synset_offset = 0; // FIXME: cntlist rows do not contain a synset offset, so this lookup is incorrect
int u = info.compute_indice(synset_offset, pos);
wn.wordnet_graph[u].sense_number += sense_number;
if (tag_cnt != 0)
wn.wordnet_graph[u].tag_cnts.push_back( make_pair(word,tag_cnt) );
}
}
} // end of anonymous namespace
void load_wordnet(const std::string& dn, wordnet& wn, info_helper& info)
{
// vertex added in this order a n r v
std::string fn = dn;
if (wn._verbose)
{
std::cout << std::endl;
std::cout << "### Loading Wordnet 3.0";
boost::progress_display show_progress(5);
boost::progress_timer t;
load_wordnet_cat(dn, "adj", wn, info);
++show_progress;
load_wordnet_cat(dn, "noun", wn, info);
++show_progress;
load_wordnet_cat(dn, "adv", wn, info);
++show_progress;
load_wordnet_cat(dn, "verb", wn, info);
++show_progress;
load_wordnet_index_sense(dn, wn, info);
++show_progress;
std::cout << std::endl;
}
else
{
load_wordnet_cat(dn, "adj", wn, info);
load_wordnet_cat(dn, "noun", wn, info);
load_wordnet_cat(dn, "adv", wn, info);
load_wordnet_cat(dn, "verb", wn, info);
load_wordnet_index_sense(dn, wn, info);
}
std::stable_sort(wn.index_list.begin(), wn.index_list.end());
}
} // end of namespace wnb

View File

@ -0,0 +1,12 @@
#pragma once
# include "info_helper.hh"
namespace wnb
{
/// forward declaration
struct wordnet;
/// Load the entire wordnet database located in \p dn (typically .../dict/)
void load_wordnet(const std::string& dn, wordnet& wn, info_helper& info);
}

View File

@ -0,0 +1,61 @@
#pragma once
namespace wnb
{
static const std::size_t POS_ARRAY_SIZE = 6;
static const char POS_ARRAY[POS_ARRAY_SIZE] = {'u', 'n', 'v', 'a', 'r', 's'};
enum pos_t
{
UNKNOWN = 0,
N = 1,
V = 2,
A = 3,
R = 4,
S = 5,
};
inline pos_t get_pos_from_name(const std::string& pos)
{
if (pos == "adj")
return A;
if (pos == "noun")
return N;
if (pos == "adv")
return R;
if (pos == "verb")
return V;
if (pos == "adj sat")
return S;
return UNKNOWN;
}
inline std::string get_name_from_pos(const pos_t& pos)
{
switch (pos)
{
case A: return "adj";
case N: return "noun";
case R: return "adv";
case V: return "verb";
case S: return "adj sat";
default: return "UNKNOWN";
}
}
inline pos_t get_pos_from_char(const char& c)
{
switch (c)
{
case 'a': return A;
case 'n': return N;
case 'r': return R;
case 'v': return V;
case 's': return S;
default: return UNKNOWN;
}
}
} // end of namespace wnb

View File

@ -0,0 +1,186 @@
#include <wnb/core/wordnet.hh>
#include <wnb/std_ext.hh>
#include <string>
#include <set>
#include <algorithm>
#include <stdexcept>
#include <boost/graph/breadth_first_search.hpp>
#include <boost/graph/filtered_graph.hpp>
namespace wnb
{
//FIXME: Make (smart) use of fs::path
wordnet::wordnet(const std::string& wordnet_dir, bool verbose)
: _verbose(verbose)
{
if (_verbose)
{
std::cout << wordnet_dir << std::endl;
}
info = preprocess_wordnet(wordnet_dir);
wordnet_graph = graph(info.nb_synsets());
load_wordnet(wordnet_dir, *this, info);
if (_verbose)
{
std::cout << "nb_synsets: " << info.nb_synsets() << std::endl;
}
//FIXME: this check is only valid for Wordnet 3.0
//assert(info.nb_synsets() == 142335);//117659);
assert(info.nb_synsets() > 0);
}
std::vector<synset>
wordnet::get_synsets(const std::string& word, pos_t pos)
{
std::vector<synset> synsets;
// morphing
std::string mword = morphword(word, pos);
if (mword == "")
return synsets;
// binary_search
typedef std::vector<index> vi;
std::pair<vi::iterator,vi::iterator> bounds = get_indexes(mword);
vi::iterator it;
for (it = bounds.first; it != bounds.second; it++)
{
if (pos == pos_t::UNKNOWN || it->pos == pos)
{
for (std::size_t i = 0; i < it->synset_ids.size(); i++)
{
int id = it->synset_ids[i];
synsets.push_back(wordnet_graph[id]);
}
}
}
return synsets;
}
const std::vector<std::string> *
wordnet::get_synset(const std::string& word, pos_t pos) const {
typedef std::vector<index> vi;
std::pair<vi::const_iterator,vi::const_iterator> bounds = get_indexes_const(word);
for (vi::const_iterator it = bounds.first; it != bounds.second; it++)
{
if (pos == pos_t::UNKNOWN || it->pos == pos)
{
int id = it->synset_ids[0];
return &wordnet_graph[id].words;
}
}
return nullptr;
}
std::pair<std::vector<index>::const_iterator, std::vector<index>::const_iterator>
wordnet::get_indexes_const(const std::string& word) const
{
index light_index;
light_index.lemma = word;
typedef std::vector<index> vi;
std::pair<vi::const_iterator,vi::const_iterator> bounds =
std::equal_range(index_list.begin(), index_list.end(), light_index);
return bounds;
}
std::pair<std::vector<index>::iterator, std::vector<index>::iterator>
wordnet::get_indexes(const std::string& word)
{
index light_index;
light_index.lemma = word;
typedef std::vector<index> vi;
std::pair<vi::iterator,vi::iterator> bounds =
std::equal_range(index_list.begin(), index_list.end(), light_index);
return bounds;
}
std::string
wordnet::wordbase(const std::string& word, int ender)
{
if (ext::ends_with(word, info.sufx[ender]))
{
int sufxlen = info.sufx[ender].size();
std::string strOut = word.substr(0, word.size() - sufxlen);
if (!info.addr[ender].empty())
strOut += info.addr[ender];
return strOut;
}
return word;
}
bool is_defined(const std::string& word, pos_t pos)
{
// hack FIXME: wordbase() can build invalid verb forms ending in -e ('builde' is just one example).
if (pos == V && word == "builde")
return false;
return true;
}
// Try to find baseform (lemma) of individual word in POS
std::string
wordnet::morphword(const std::string& word, pos_t pos)
{
// first look for word on exception list
exc_t::iterator it = exc[pos].find(word);
if (it != exc[pos].end())
return it->second; // found in exception list
std::string tmpbuf;
std::string end;
int cnt = 0;
if (pos == R)
return ""; // Only use exception list for adverbs
if (pos == N)
{
if (ext::ends_with(word, "ful"))
{
cnt = word.size() - 3;
tmpbuf = word.substr(0, cnt);
end = "ful";
}
else
{
// check for noun ending with 'ss' or short words
if (ext::ends_with(word, "ss") || word.size() <= 2)
return "";
}
}
// If not in exception list, try applying rules from tables
if (tmpbuf.size() == 0)
tmpbuf = word;
if (pos != pos_t::UNKNOWN)
{
int offset = info.offsets[pos];
int pos_cnt = info.cnts[pos];
std::string morphed;
for (int i = 0; i < pos_cnt; i++)
{
morphed = wordbase(tmpbuf, (i + offset));
if (morphed != tmpbuf && is_defined(morphed, pos))
return morphed + end;
}
return morphed;
}
return word;
}
} // end of namespace wnb

View File

@ -0,0 +1,113 @@
#pragma once
# include <iostream>
# include <string>
# include <cassert>
# include <vector>
//# include <boost/filesystem.hpp>
//Possible https://bugs.launchpad.net/ubuntu/+source/boost/+bug/270873
# include <boost/graph/graph_traits.hpp>
# include <boost/graph/adjacency_list.hpp>
# include "load_wordnet.hh"
# include "pos_t.hh"
namespace wnb
{
/// More info here: http://wordnet.princeton.edu/wordnet/man/wndb.5WN.html
struct info_helper;
/// Synset
struct synset
{
int lex_filenum;
std::size_t w_cnt;
std::vector<std::string> words;
std::vector<int> lex_ids;
std::size_t p_cnt;
std::string gloss;
// extra
pos_t pos; ///< pos (replace ss_type)
int id; ///< unique identifier (replace synset_offset)
int sense_number; ///< http://wordnet.princeton.edu/man/senseidx.5WN.html
std::vector<std::pair<std::string, int> > tag_cnts; ///< http://wordnet.princeton.edu/man/senseidx.5WN.html
bool operator==(const synset& s) const { return (id == s.id); }
bool operator<(const synset& s) const { return (id < s.id); }
};
/// Rel between synsets properties
struct ptr
{
//std::string pointer_symbol; ///< symbol of the relation
int pointer_symbol;
int source; ///< source word inside synset
int target; ///< target word inside synset
};
/// Index
struct index
{
std::string lemma;
std::size_t synset_cnt;
std::size_t p_cnt;
std::size_t sense_cnt;
float tagsense_cnt;
std::vector<std::string> ptr_symbols;
std::vector<int> synset_offsets;
// extra
std::vector<int> synset_ids;
pos_t pos;
bool operator<(const index& b) const
{
return (lemma.compare(b.lemma) < 0);
}
};
/// Wordnet interface class
struct wordnet
{
typedef boost::adjacency_list<boost::vecS, boost::vecS,
boost::directedS,
synset, ptr> graph; ///< boost graph type
/// Constructor
wordnet(const std::string& wordnet_dir, bool verbose=false);
/// Return synsets matching word
std::vector<synset> get_synsets(const std::string& word, pos_t pos = pos_t::UNKNOWN);
//FIXME: todo
std::vector<synset> get_synset(const std::string& word, char pos, int i);
// added
const std::vector<std::string> * get_synset(const std::string& word, pos_t pos = pos_t::UNKNOWN) const;
std::pair<std::vector<index>::iterator, std::vector<index>::iterator>
get_indexes(const std::string& word);
std::pair<std::vector<index>::const_iterator, std::vector<index>::const_iterator>
get_indexes_const(const std::string& word) const;
std::string wordbase(const std::string& word, int ender);
std::string morphword(const std::string& word, pos_t pos);
std::vector<index> index_list; ///< index list // FIXME: use a map
graph wordnet_graph; ///< synsets graph
info_helper info; ///< helper object
bool _verbose;
typedef std::map<std::string,std::string> exc_t;
std::map<pos_t, exc_t> exc;
};
} // end of namespace wnb

View File

@ -0,0 +1,180 @@
#include <iostream>
#include <fstream>
#include <sstream>
#include <boost/progress.hpp>
#include <boost/algorithm/string.hpp>
#include <wnb/core/wordnet.hh>
#include <wnb/core/load_wordnet.hh>
#include <wnb/core/info_helper.hh>
#include <wnb/nltk_similarity.hh>
#include <wnb/std_ext.hh>
using namespace wnb;
using namespace boost;
using namespace boost::algorithm;
bool usage(int argc, char ** argv)
{
std::string dir;
if (argc >= 2)
dir = std::string(argv[1]);
if (argc != 3 || dir[dir.length()-1] != '/')
{
std::cout << argv[0] << " .../wordnet_dir/ word_list_file" << std::endl;
return true;
}
return false;
}
struct ws
{
std::string w;
float s;
bool operator<(const ws& a) const {return s > a.s;}
};
/// Compute similarity of word with words in word list
std::vector<ws>
compute_similarities(wordnet& wn,
const std::string& word,
const std::vector<std::string>& word_list)
{
std::vector<ws> wslist;
std::vector<synset> synsets1 = wn.get_synsets(word);
for (unsigned i = 0; i < synsets1.size(); i++)
for (unsigned k = 0; k < synsets1[i].words.size(); k++)
std::cout << " - " << synsets1[i].words[k] << std::endl;
nltk_similarity path_similarity(wn);
{
progress_timer t;
progress_display show_progress(word_list.size());
for (unsigned k = 0; k < word_list.size(); k++)
{
const std::string& w = word_list[k];
float max = 0;
std::vector<synset> synsets2 = wn.get_synsets(w);
for (unsigned i = 0; i < synsets1.size(); i++)
{
for (unsigned j = 0; j < synsets2.size(); j++)
{
float s = path_similarity(synsets1[i], synsets2[j], 6);
if (s > max)
max = s;
}
}
ws e = {w, max};
wslist.push_back(e);
++show_progress;
}
}
return wslist;
}
void similarity_test(wordnet& wn,
const std::string& word,
std::vector<std::string>& word_list)
{
std::vector<ws> wslist = compute_similarities(wn, word, word_list);
std::stable_sort(wslist.begin(), wslist.end());
for (unsigned i = 0; i < std::min(wslist.size(), size_t(10)); i++)
std::cout << wslist[i].w << " " << wslist[i].s << std::endl;
}
void print_synsets(pos_t pos, wnb::index& idx, wordnet& wn)
{
std::string& mword = idx.lemma;
std::cout << "\nOverview of " << get_name_from_pos(pos) << " " << mword << "\n\n";
std::cout << "The " << get_name_from_pos(pos) << " " << mword << " has "
<< idx.synset_ids.size() << ((idx.synset_ids.size() == 1) ? " sense": " senses");
if (idx.tagsense_cnt != 0)
std::cout << " (first " << idx.tagsense_cnt << " from tagged texts)";
else
std::cout << " (no senses from tagged texts)";
std::cout << "\n";
std::cout << " \n";
for (std::size_t i = 0; i < idx.synset_ids.size(); i++)
{
int id = idx.synset_ids[i];
const synset& synset = wn.wordnet_graph[id];
std::cout << i+1 << ". ";
for (std::size_t k = 0; k < synset.tag_cnts.size(); k++)
{
if (synset.tag_cnts[k].first == mword)
std::cout << "(" << synset.tag_cnts[k].second << ") ";
}
std::vector<std::string> nwords;
for (auto& w : synset.words)
nwords.push_back((pos == A) ? w.substr(0, w.find_first_of("(")) : w);
std::cout << replace_all_copy(join(nwords, ", "), "_", " ");
std::cout << " -- (" << trim_copy(synset.gloss) << ")";
std::cout << std::endl;
}
}
void wn_like(wordnet& wn, const std::string& word, pos_t pos)
{
if (word == "")
return;
typedef std::vector<wnb::index> vi;
std::pair<vi::iterator,vi::iterator> bounds = wn.get_indexes(word);
for (vi::iterator it = bounds.first; it != bounds.second; it++)
{
if (pos != -1 && it->pos == pos)
{
print_synsets(pos, *it, wn);
}
}
}
void batch_test(wordnet& wn, std::vector<std::string>& word_list)
{
for (std::size_t i = 0; i < word_list.size(); i++)
{
for (unsigned p = 1; p < POS_ARRAY_SIZE; p++)
{
pos_t pos = (pos_t) p;
wn_like(wn, word_list[i], pos);
std::string mword = wn.morphword(word_list[i], pos);
if (mword != word_list[i])
wn_like(wn, mword, pos);
}
}
}
int main(int argc, char ** argv)
{
if (usage(argc, argv))
return 1;
// read command line
std::string wordnet_dir = argv[1];
std::string test_file = argv[2];
wordnet wn(wordnet_dir);
// read test file
std::string list = ext::read_file(test_file);
std::vector<std::string> wl = ext::split(list);
batch_test(wn, wl);
}

View File

@ -0,0 +1,146 @@
#ifndef _NLTK_SIMILARITY_HH
# define _NLTK_SIMILARITY_HH
# include <queue>
# include <boost/graph/filtered_graph.hpp>
# include <wnb/core/wordnet.hh>
namespace wnb
{
namespace internal
{
// Helper class that filters out all relations other than hypernymy
template <typename PointerSymbolMap>
struct hyper_edge
{
hyper_edge() { }
hyper_edge(PointerSymbolMap pointer_symbol)
: m_pointer_symbol(pointer_symbol) { }
template <typename Edge>
bool operator()(const Edge& e) const
{
int p_s = get(m_pointer_symbol, e);
return p_s == 1; // hypernym (instance hypernym not used here)
}
PointerSymbolMap m_pointer_symbol;
};
} // end of namespace internal
class nltk_similarity
{
typedef boost::property_map<wordnet::graph,
int ptr::*>::type PointerSymbolMap;
typedef boost::filtered_graph<wordnet::graph,
internal::hyper_edge<PointerSymbolMap> > G;
typedef boost::graph_traits<G>::vertex_descriptor vertex;
internal::hyper_edge<PointerSymbolMap> filter;
G fg;
public:
nltk_similarity(wordnet& wn)
: filter(get(&ptr::pointer_symbol, wn.wordnet_graph)),
fg(wn.wordnet_graph, filter)
{ }
/// Get list of hypernyms of s along with distance to s
std::map<vertex, int> hypernym_map(vertex s);
/// Get the shortest path distance between synset1 and synset2.
int shortest_path_distance(const synset& synset1, const synset& synset2);
/// Return the path similarity 1 / (distance + 1), or -1 if the synsets are not connected
float operator()(const synset& synset1, const synset& synset2, int=0);
};
std::map<nltk_similarity::vertex, int>
nltk_similarity::hypernym_map(nltk_similarity::vertex s)
{
std::map<vertex, int> map;
// Python:
// for (hypernym in self[HYPERNYM])
// distances |= hypernym.hypernym_distances(distance+1);
boost::graph_traits<G>::out_edge_iterator e, e_end;
std::queue<vertex> q;
q.push(s);
map[s] = 0;
while (!q.empty())
{
vertex u = q.front(); q.pop();
int new_d = map[u] + 1;
for (boost::tuples::tie(e, e_end) = out_edges(u, fg); e != e_end; ++e)
{
vertex v = target(*e,fg);
q.push(v);
if (map.find(v) != map.end())
{
if (new_d < map[v])
map[v] = new_d;
else
q.pop();
}
else
map[v] = new_d;
}
}
return map;
}
int
nltk_similarity::shortest_path_distance(const synset& synset1, const synset& synset2)
{
vertex v1 = synset1.id;
vertex v2 = synset2.id;
std::map<vertex, int> map1 = hypernym_map(v1);
std::map<vertex, int> map2 = hypernym_map(v2);
// For each ancestor synset common to both subject synsets, find the
// connecting path length. Return the shortest of these.
int path_distance = -1;
std::map<vertex, int>::iterator it, it2;
for (it = map1.begin(); it != map1.end(); it++)
for (it2 = map2.begin(); it2 != map2.end(); it2++)
if (fg[it->first] == fg[it2->first])
{
int new_distance = it->second + it2->second;
if (path_distance < 0 || new_distance < path_distance)
path_distance = new_distance;
}
return path_distance;
}
float
nltk_similarity::operator()(const synset& synset1, const synset& synset2, int)
{
int distance = shortest_path_distance(synset1, synset2);
if (distance >= 0)
return 1. / (distance + 1);
else
return -1;
}
} // end of namespace wnb
#endif /* _NLTK_SIMILARITY_HH */

View File

@ -0,0 +1,90 @@
#ifndef _STD_EXT_HH
# define _STD_EXT_HH
# include <string>
# include <sstream>
# include <fstream>
# include <algorithm>
# include <stdexcept>
namespace ext
{
/// Read a file, return the content as a C++ string
inline
std::string read_file(const std::string& fn)
{
std::ifstream is;
is.open(fn.c_str(), std::ios::binary);
if (!is.is_open())
throw std::runtime_error("File not found: " + fn);
std::string str((std::istreambuf_iterator<char>(is)),
std::istreambuf_iterator<char>());
return str;
}
/// Split a std::string
inline
std::vector<std::string> split(const std::string& str)
{
std::vector<std::string> tokens;
std::istringstream iss(str);
copy(std::istream_iterator<std::string>(iss),
std::istream_iterator<std::string>(),
std::back_inserter< std::vector<std::string> >(tokens));
return tokens;
}
/// Split a std::string on separator
inline
std::vector<std::string> split(const std::string& s, char separator)
{
std::vector<std::string> output;
std::string::size_type prev_pos = 0, pos = 0;
while((pos = s.find(separator, pos)) != std::string::npos)
{
std::string substring( s.substr(prev_pos, pos-prev_pos) );
output.push_back(substring);
prev_pos = ++pos;
}
output.push_back(s.substr(prev_pos, pos-prev_pos));
return output;
}
inline
bool
ends_with(const std::string& str, const std::string& ending)
{
if (str.length() >= ending.length())
{
int cmp = str.compare(str.length() - ending.length(),
ending.length(), ending);
return (0 == cmp);
}
return false;
}
/// Sorted unique
template <typename T>
inline
T s_unique(T& v)
{
T out;
std::sort(v.begin(), v.end());
typename T::iterator last = std::unique(v.begin(),v.end());
out.resize(last - v.begin());
std::copy(v.begin(), last, out.begin());
return out;
}
} // end of ext
#endif /* _STD_EXT_HH */

View File

@ -124,3 +124,4 @@ endif()
set_source_files_properties("pointInPolygon.cpp" PROPERTIES COMPILE_FLAGS -fno-sanitize=signed-integer-overflow)
target_link_libraries(clickhouse_functions PRIVATE stemmer)
target_link_libraries(clickhouse_functions PRIVATE wnb)

View File

@ -1,10 +1,11 @@
#include <Common/Exception.h>
#include <Interpreters/SynonymsExtensions.h>
#include <Functions/SynonymsExtensions.h>
#include <fstream>
#include <list>
#include <boost/algorithm/string.hpp>
#include <wnb/core/wordnet.hh>
namespace DB
{
@ -48,7 +49,7 @@ public:
}
}
Synset * getSynonyms(const std::string_view & token) const override
const Synset * getSynonyms(const std::string_view & token) const override
{
auto it = table.find(token);
@ -62,20 +63,23 @@ public:
class WordnetSynonymsExtension : public ISynonymsExtension
{
private:
// std::vector<std::vector<String>> data;
wnb::wordnet wn;
public:
WordnetSynonymsExtension(const String & /*path*/)
{
WordnetSynonymsExtension(const String & path) : wn(path) {}
}
Synset * getSynonyms(const std::string_view & /*token*/) const override
const Synset * getSynonyms(const std::string_view & token) const override
{
return nullptr;
return wn.get_synset(std::string(token));
}
};
/// Duplicate of code from StringUtils.h. Copied here for fewer dependencies.
static bool startsWith(const std::string & s, const char * prefix)
{
return s.size() >= strlen(prefix) && 0 == memcmp(s.data(), prefix, strlen(prefix));
}
SynonymsExtensions::SynonymsExtensions(const Poco::Util::AbstractConfiguration & config)
{
String prefix = "synonyms_extensions";
@ -89,7 +93,7 @@ SynonymsExtensions::SynonymsExtensions(const Poco::Util::AbstractConfiguration &
for (const auto & key : keys)
{
if (key == "extension")
if (startsWith(key, "extension"))
{
const auto & ext_name = config.getString(prefix + "." + key + ".name", "");
const auto & ext_path = config.getString(prefix + "." + key + ".path", "");

View File

@ -19,7 +19,7 @@ public:
//ISynonymsExtension(const String & path);
virtual Synset * getSynonyms(const std::string_view & token) const = 0;
virtual const Synset * getSynonyms(const std::string_view & token) const = 0;
virtual ~ISynonymsExtension() = default;
};

View File

@ -6,7 +6,7 @@
#include <Functions/FunctionFactory.h>
#include <Functions/FunctionHelpers.h>
#include <Functions/IFunction.h>
#include <Interpreters/SynonymsExtensions.h>
#include <Functions/SynonymsExtensions.h>
#include <Interpreters/Context.h>
#include <string_view>

View File

@ -53,7 +53,6 @@
#include <Interpreters/InterserverCredentials.h>
#include <Interpreters/Cluster.h>
#include <Interpreters/InterserverIOHandler.h>
#include <Interpreters/SynonymsExtensions.h>
#include <Interpreters/SystemLog.h>
#include <Interpreters/Context.h>
#include <Interpreters/DDLWorker.h>
@ -75,7 +74,7 @@
#include <Interpreters/DatabaseCatalog.h>
#include <Storages/MergeTree/BackgroundJobsExecutor.h>
#include <Storages/MergeTree/MergeTreeDataPartUUID.h>
#include <Functions/SynonymsExtensions.h>
namespace ProfileEvents
{