From ed12fb5604c308ab1cc1af684187aa38540376f0 Mon Sep 17 00:00:00 2001 From: Nikolay Degterinsky Date: Thu, 3 Jun 2021 19:28:12 +0000 Subject: [PATCH] added WordNet synonyms extensions --- contrib/CMakeLists.txt | 1 + contrib/boost-cmake/CMakeLists.txt | 18 +- contrib/wordnet-blast-cmake/CMakeLists.txt | 13 + contrib/wordnet-blast/AUTHORS | 1 + contrib/wordnet-blast/CMakeLists.txt | 65 ++ contrib/wordnet-blast/README | 43 + contrib/wordnet-blast/WORDNET_LICENSE | 25 + contrib/wordnet-blast/changelog | 11 + contrib/wordnet-blast/check/biglist.txt | 852 ++++++++++++++++++ contrib/wordnet-blast/check/check.sh | 16 + contrib/wordnet-blast/check/list.txt | 7 + contrib/wordnet-blast/wnb/bfs.hh | 72 ++ contrib/wordnet-blast/wnb/core/info_helper.cc | 148 +++ contrib/wordnet-blast/wnb/core/info_helper.hh | 85 ++ .../wordnet-blast/wnb/core/load_wordnet.cc | 381 ++++++++ .../wordnet-blast/wnb/core/load_wordnet.hh | 12 + contrib/wordnet-blast/wnb/core/pos_t.hh | 61 ++ contrib/wordnet-blast/wnb/core/wordnet.cc | 186 ++++ contrib/wordnet-blast/wnb/core/wordnet.hh | 113 +++ contrib/wordnet-blast/wnb/main.cc | 180 ++++ contrib/wordnet-blast/wnb/nltk_similarity.hh | 146 +++ contrib/wordnet-blast/wnb/std_ext.hh | 90 ++ src/Functions/CMakeLists.txt | 3 +- .../SynonymsExtensions.cpp | 24 +- .../SynonymsExtensions.h | 2 +- src/Functions/synonyms.cpp | 2 +- src/Interpreters/Context.cpp | 3 +- 27 files changed, 2544 insertions(+), 16 deletions(-) create mode 100644 contrib/wordnet-blast-cmake/CMakeLists.txt create mode 100644 contrib/wordnet-blast/AUTHORS create mode 100644 contrib/wordnet-blast/CMakeLists.txt create mode 100644 contrib/wordnet-blast/README create mode 100644 contrib/wordnet-blast/WORDNET_LICENSE create mode 100644 contrib/wordnet-blast/changelog create mode 100644 contrib/wordnet-blast/check/biglist.txt create mode 100644 contrib/wordnet-blast/check/check.sh create mode 100644 contrib/wordnet-blast/check/list.txt create mode 100644 contrib/wordnet-blast/wnb/bfs.hh create mode 100644 contrib/wordnet-blast/wnb/core/info_helper.cc create mode 100644 contrib/wordnet-blast/wnb/core/info_helper.hh create mode 100644 contrib/wordnet-blast/wnb/core/load_wordnet.cc create mode 100644 contrib/wordnet-blast/wnb/core/load_wordnet.hh create mode 100644 contrib/wordnet-blast/wnb/core/pos_t.hh create mode 100644 contrib/wordnet-blast/wnb/core/wordnet.cc create mode 100644 contrib/wordnet-blast/wnb/core/wordnet.hh create mode 100644 contrib/wordnet-blast/wnb/main.cc create mode 100644 contrib/wordnet-blast/wnb/nltk_similarity.hh create mode 100644 contrib/wordnet-blast/wnb/std_ext.hh rename src/{Interpreters => Functions}/SynonymsExtensions.cpp (86%) rename src/{Interpreters => Functions}/SynonymsExtensions.h (91%) diff --git a/contrib/CMakeLists.txt b/contrib/CMakeLists.txt index 57aef8beef3..c4d08448e89 100644 --- a/contrib/CMakeLists.txt +++ b/contrib/CMakeLists.txt @@ -331,3 +331,4 @@ endif() add_subdirectory(fast_float) add_subdirectory(libstemmer-c-cmake) +add_subdirectory(wordnet-blast-cmake) diff --git a/contrib/boost-cmake/CMakeLists.txt b/contrib/boost-cmake/CMakeLists.txt index 9f6c5b1255d..d00179f13cd 100644 --- a/contrib/boost-cmake/CMakeLists.txt +++ b/contrib/boost-cmake/CMakeLists.txt @@ -13,11 +13,12 @@ if (NOT USE_INTERNAL_BOOST_LIBRARY) regex context coroutine + graph ) if(Boost_INCLUDE_DIR AND Boost_FILESYSTEM_LIBRARY AND Boost_FILESYSTEM_LIBRARY AND Boost_PROGRAM_OPTIONS_LIBRARY AND Boost_REGEX_LIBRARY AND Boost_SYSTEM_LIBRARY AND Boost_CONTEXT_LIBRARY AND - Boost_COROUTINE_LIBRARY) + 
Boost_COROUTINE_LIBRARY AND Boost_GRAPH_LIBRARY) set(EXTERNAL_BOOST_FOUND 1) @@ -32,6 +33,7 @@ if (NOT USE_INTERNAL_BOOST_LIBRARY) add_library (_boost_system INTERFACE) add_library (_boost_context INTERFACE) add_library (_boost_coroutine INTERFACE) + add_library (_boost_graph INTERFACE) target_link_libraries (_boost_filesystem INTERFACE ${Boost_FILESYSTEM_LIBRARY}) target_link_libraries (_boost_iostreams INTERFACE ${Boost_IOSTREAMS_LIBRARY}) @@ -40,6 +42,7 @@ if (NOT USE_INTERNAL_BOOST_LIBRARY) target_link_libraries (_boost_system INTERFACE ${Boost_SYSTEM_LIBRARY}) target_link_libraries (_boost_context INTERFACE ${Boost_CONTEXT_LIBRARY}) target_link_libraries (_boost_coroutine INTERFACE ${Boost_COROUTINE_LIBRARY}) + target_link_libraries (_boost_graph INTERFACE ${Boost_GRAPH_LIBRARY}) add_library (boost::filesystem ALIAS _boost_filesystem) add_library (boost::iostreams ALIAS _boost_iostreams) @@ -48,6 +51,7 @@ if (NOT USE_INTERNAL_BOOST_LIBRARY) add_library (boost::system ALIAS _boost_system) add_library (boost::context ALIAS _boost_context) add_library (boost::coroutine ALIAS _boost_coroutine) + add_library (boost::graph ALIAS _boost_graph) else() set(EXTERNAL_BOOST_FOUND 0) message (${RECONFIGURE_MESSAGE_LEVEL} "Can't find system boost") @@ -221,4 +225,16 @@ if (NOT EXTERNAL_BOOST_FOUND) add_library (boost::coroutine ALIAS _boost_coroutine) target_include_directories (_boost_coroutine PRIVATE ${LIBRARY_DIR}) target_link_libraries(_boost_coroutine PRIVATE _boost_context) + + # graph + + set (SRCS_GRAPH + "${LIBRARY_DIR}/libs/graph/src/graphml.cpp" + "${LIBRARY_DIR}/libs/graph/src/read_graphviz_new.cpp" + ) + + add_library (_boost_graph ${SRCS_GRAPH}) + add_library (boost::graph ALIAS _boost_graph) + target_include_directories (_boost_graph PRIVATE ${LIBRARY_DIR}) + endif () diff --git a/contrib/wordnet-blast-cmake/CMakeLists.txt b/contrib/wordnet-blast-cmake/CMakeLists.txt new file mode 100644 index 00000000000..8d59c312664 --- /dev/null +++ b/contrib/wordnet-blast-cmake/CMakeLists.txt @@ -0,0 +1,13 @@ +set(LIBRARY_DIR "${ClickHouse_SOURCE_DIR}/contrib/wordnet-blast") + +set(SRCS + "${LIBRARY_DIR}/wnb/core/info_helper.cc" + "${LIBRARY_DIR}/wnb/core/load_wordnet.cc" + "${LIBRARY_DIR}/wnb/core/wordnet.cc" +) + +add_library(wnb ${SRCS}) + +target_link_libraries(wnb PRIVATE boost::headers_only boost::graph) + +target_include_directories(wnb PUBLIC "${LIBRARY_DIR}") \ No newline at end of file diff --git a/contrib/wordnet-blast/AUTHORS b/contrib/wordnet-blast/AUTHORS new file mode 100644 index 00000000000..6f4850f0a96 --- /dev/null +++ b/contrib/wordnet-blast/AUTHORS @@ -0,0 +1 @@ +Ugo Jardonnet ugo.jardonnet/gmail \ No newline at end of file diff --git a/contrib/wordnet-blast/CMakeLists.txt b/contrib/wordnet-blast/CMakeLists.txt new file mode 100644 index 00000000000..457b5e7aa29 --- /dev/null +++ b/contrib/wordnet-blast/CMakeLists.txt @@ -0,0 +1,65 @@ +CMAKE_MINIMUM_REQUIRED(VERSION 2.6) + +PROJECT(wnb) + +# Boost dependency +#-------------------------------------------------- + +# IF (${CMAKE_SYSTEM_NAME} MATCHES "Darwin") +# SET (BOOST_ROOT /Developer/boost_build/) # Suggested path +# ELSE() +# SET (BOOST_ROOT "/usr/include") +# ENDIF() +############## +SET (BOOST_ROOT "${ClickHouse_SOURCE_DIR}/contrib/boost") +############## +MESSAGE(STATUS "** Search Boost root: ${BOOST_ROOT}") +FIND_PACKAGE(Boost 1.70.0 COMPONENTS graph REQUIRED) +MESSAGE(STATUS "** Boost Include: ${Boost_INCLUDE_DIR}") +MESSAGE(STATUS "** Boost Libraries: ${Boost_LIBRARY_DIRS}") +MESSAGE(STATUS "** Boost Libraries: 
${Boost_LIBRARIES}") + +INCLUDE_DIRECTORIES(${Boost_INCLUDE_DIR}) +LINK_DIRECTORIES(${Boost_LIBRARY_DIRS}) + +# Project +#-------------------------------------------------- + +LINK_DIRECTORIES(${wnb_SOURCE_DIR}/lib) +INCLUDE_DIRECTORIES(${PROJECT_BINARY_DIR}) + +SET(PROJECT_VERSION "0.6") +SET(ARCHIVE_NAME ${CMAKE_PROJECT_NAME}-${PROJECT_VERSION}) + +ADD_CUSTOM_TARGET(dist + COMMAND git archive --prefix=${ARCHIVE_NAME}/ HEAD + | bzip2 > ${CMAKE_BINARY_DIR}/${ARCHIVE_NAME}.tar.bz2 + WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}) + +ADD_CUSTOM_TARGET(check + COMMAND ./check/check.sh ./check/list.txt) + + +## Compiler flags +IF (CMAKE_COMPILER_IS_GNUCXX) + list(APPEND CMAKE_CXX_FLAGS " --std=c++11 -O3 -DNDEBUG -Wall -Wextra") + #list(APPEND CMAKE_CXX_FLAGS " -g -Wall -Wextra") +ENDIF() + +SET(WNB_SRCS wnb/core/wordnet.cc + wnb/core/load_wordnet.cc wnb/core/info_helper.cc) + +# Executable +#-------------------------------------------------- +ADD_EXECUTABLE (wntest wnb/main.cc ${WNB_SRCS}) +SET(EXECUTABLE_OUTPUT_PATH ${wnb_BINARY_DIR}/bin) + +# Static library +#-------------------------------------------------- +ADD_LIBRARY(wnb ${WNB_SRCS}) +SET(LIBRARY_OUTPUT_PATH ${wnb_BINARY_DIR}/lib) + +IF (Boost_FOUND) + TARGET_LINK_LIBRARIES(wntest ${Boost_LIBRARIES}) + TARGET_LINK_LIBRARIES(wnb ${Boost_LIBRARIES}) +ENDIF() \ No newline at end of file diff --git a/contrib/wordnet-blast/README b/contrib/wordnet-blast/README new file mode 100644 index 00000000000..32aec5c7900 --- /dev/null +++ b/contrib/wordnet-blast/README @@ -0,0 +1,43 @@ + +===================================================================== + WordNet Blast +===================================================================== + +In memory access to the wordnet onthology. + +DEPENDENCIES: + boost 1.46 + wordnet-sense-index + colordiff (for wntest) + +INSTALL: + cmake CMakeLists.txt + make + +TESTS: (Beta) + make check + +USAGE: + #include "wordnet.hh" + #include "wnb/nltk_similarity.hh" + + using namespace std; + using namespace wnb; + + int main() + { + wordnet wn(PATH_TO_WORDNET); + + vector synsets1 = wn.get_synsets("cat"); + vector synsets2 = wn.get_synsets("dog"); + + nltk_similarity similarity(wn); + float d = similarity(synsets1[0], synsets2[0], 6); + } + +BUGS: + - Word Morphing is sometimes incorrect. + +REFERENCE: + George A. Miller (1995). WordNet: A Lexical Database for English. + Communications of the ACM Vol. 38, No. 11: 39-41. diff --git a/contrib/wordnet-blast/WORDNET_LICENSE b/contrib/wordnet-blast/WORDNET_LICENSE new file mode 100644 index 00000000000..ef9a754631a --- /dev/null +++ b/contrib/wordnet-blast/WORDNET_LICENSE @@ -0,0 +1,25 @@ +This license is available as the file LICENSE in any downloaded version of +WordNet. + +WordNet Release 3.0 + +This software and database is being provided to you, the LICENSEE, by Princeton +University under the following license. By obtaining, using and/or copying this +software and database, you agree that you have read, understood, and will comply +with these terms and conditions.: Permission to use, copy, modify and distribute +this software and database and its documentation for any purpose and without fee +or royalty is hereby granted, provided that you agree to comply with the +following copyright notice and statements, including the disclaimer, and that +the same appear on ALL copies of the software, database and documentation, +including modifications that you make for internal use or for distribution. +WordNet 3.0 Copyright 2006 by Princeton University. All rights reserved. 
THIS +SOFTWARE AND DATABASE IS PROVIDED "AS IS" AND PRINCETON UNIVERSITY MAKES NO +REPRESENTATIONS OR WARRANTIES, EXPRESS OR IMPLIED. BY WAY OF EXAMPLE, BUT NOT +LIMITATION, PRINCETON UNIVERSITY MAKES NO REPRESENTATIONS OR WARRANTIES OF +MERCHANT- ABILITY OR FITNESS FOR ANY PARTICULAR PURPOSE OR THAT THE USE OF THE +LICENSED SOFTWARE, DATABASE OR DOCUMENTATION WILL NOT INFRINGE ANY THIRD PARTY +PATENTS, COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS. The name of Princeton +University or Princeton may not be used in advertising or publicity pertaining +to distribution of the software and/or database. Title to copyright in this +software, database and any associated documentation shall at all times remain +with Princeton University and LICENSEE agrees to preserve same. diff --git a/contrib/wordnet-blast/changelog b/contrib/wordnet-blast/changelog new file mode 100644 index 00000000000..7bbe4ce1101 --- /dev/null +++ b/contrib/wordnet-blast/changelog @@ -0,0 +1,11 @@ + * 0.6 + - Improve tests + - get_synsets by pos + - Load wordnet a bit faster + - Fix build on Mac Os (thanks to Roman Kutlak) + - Update doc + - Improve testing + * 0.5 + - get_synsets + with morphing partially implemented (thanks to Yaron Feigin) + - sense similarity diff --git a/contrib/wordnet-blast/check/biglist.txt b/contrib/wordnet-blast/check/biglist.txt new file mode 100644 index 00000000000..0aa2125808d --- /dev/null +++ b/contrib/wordnet-blast/check/biglist.txt @@ -0,0 +1,852 @@ +a +able +about +account +acid +across +act +addition +adjustment +advertisement +after +again +against +agreement +air +all +almost +among +amount +amusement +and +angle +angry +animal +answer +ant +any +apparatus +apple +approval +arch +argument +arm +army +art +as +at +attack +attempt +attention +attraction +authority +automatic +awake +baby +back +bad +bag +balance +ball +band +base +basin +basket +bath +be +beautiful +because +bed +bee +before +behaviour +belief +bell +bent +berry +between +bird +birth +bit +bite +bitter +black +blade +blood +blow +blue +board +boat +body +boiling +bone +book +boot +bottle +box +boy +brain +brake +branch +brass +bread +breath +brick +bridge +bright +broken +brother +brown +brush +bucket +building +bulb +burn +burst +business +but +butter +button +by +cake +camera +canvas +card +care +carriage +cart +cat +cause +certain +chain +chalk +chance +change +cheap +cheese +chemical +chest +chief +chin +church +circle +clean +clear +clock +cloth +cloud +coal +coat +cold +collar +colour +comb +come +comfort +committee +common +company +comparison +competition +complete +complex +condition +connection +conscious +control +cook +copper +copy +cord +cork +cotton +cough +country +cover +cow +crack +credit +crime +cruel +crush +cry +cup +cup +current +curtain +curve +cushion +damage +danger +dark +daughter +day +dead +dear +death +debt +decision +deep +degree +delicate +dependent +design +desire +destruction +detail +development +different +digestion +direction +dirty +discovery +discussion +disease +disgust +distance +distribution +division +do +dog +door +doubt +down +drain +drawer +dress +drink +driving +drop +dry +dust +ear +early +earth +east +edge +education +effect +egg +elastic +electric +end +engine +enough +equal +error +even +event +ever +every +example +exchange +existence +expansion +experience +expert +eye +face +fact +fall +false +family +far +farm +fat +father +fear +feather +feeble +feeling +female +fertile +fiction +field +fight +finger +fire +first +fish +fixed +flag +flame +flat +flight +floor +flower +fly 
+fold +food +foolish +foot +for +force +fork +form +forward +fowl +frame +free +frequent +friend +from +front +fruit +full +future +garden +general +get +girl +give +glass +glove +go +goat +gold +good +government +grain +grass +great +green +grey +grip +group +growth +guide +gun +hair +hammer +hand +hanging +happy +harbour +hard +harmony +hat +hate +have +he +head +healthy +hear +hearing +heart +heat +help +high +history +hole +hollow +hook +hope +horn +horse +hospital +hour +house +how +humour +I +ice +idea +if +ill +important +impulse +in +increase +industry +ink +insect +instrument +insurance +interest +invention +iron +island +jelly +jewel +join +journey +judge +jump +keep +kettle +key +kick +kind +kiss +knee +knife +knot +knowledge +land +language +last +late +laugh +law +lead +leaf +learning +leather +left +leg +let +letter +level +library +lift +light +like +limit +line +linen +lip +liquid +list +little +living +lock +long +look +loose +loss +loud +love +low +machine +make +male +man +manager +map +mark +market +married +mass +match +material +may +meal +measure +meat +medical +meeting +memory +metal +middle +military +milk +mind +mine +minute +mist +mixed +money +monkey +month +moon +morning +mother +motion +mountain +mouth +move +much +muscle +music +nail +name +narrow +nation +natural +near +necessary +neck +need +needle +nerve +net +new +news +night +no +noise +normal +north +nose +not +note +now +number +nut +observation +of +off +offer +office +oil +old +on +only +open +operation +opinion +opposite +or +orange +order +organization +ornament +other +out +oven +over +owner +page +pain +paint +paper +parallel +parcel +part +past +paste +payment +peace +pen +pencil +person +physical +picture +pig +pin +pipe +place +plane +plant +plate +play +please +pleasure +plough +pocket +point +poison +polish +political +poor +porter +position +possible +pot +potato +powder +power +present +price +print +prison +private +probable +process +produce +profit +property +prose +protest +public +pull +pump +punishment +purpose +push +put +quality +question +quick +quiet +quite +rail +rain +range +rat +rate +ray +reaction +reading +ready +reason +receipt +record +red +regret +regular +relation +religion +representative +request +respect +responsible +rest +reward +rhythm +rice +right +ring +river +road +rod +roll +roof +room +root +rough +round +rub +rule +run +sad +safe +sail +salt +same +sand +say +scale +school +science +scissors +screw +sea +seat +second +secret +secretary +see +seed +seem +selection +self +send +sense +separate +serious +servant +sex +shade +shake +shame +sharp +sheep +shelf +ship +shirt +shock +shoe +short +shut +side +sign +silk +silver +simple +sister +size +skin + +skirt +sky +sleep +slip +slope +slow +small +smash +smell +smile +smoke +smooth +snake +sneeze +snow +so +soap +society +sock +soft +solid +some + +son +song +sort +sound +soup +south +space +spade +special +sponge +spoon +spring +square +stage +stamp +star +start +statement +station +steam +steel +stem +step +stick +sticky +stiff +still +stitch +stocking +stomach +stone +stop +store +story +straight +strange +street +stretch +strong +structure +substance +such +sudden +sugar +suggestion +summer +sun +support +surprise +sweet +swim +system +table +tail +take +talk +tall +taste +tax +teaching +tendency +test +than +that +the +then +theory +there +thick +thin +thing +this +thought +thread +throat +through +through +thumb +thunder +ticket +tight +till +time +tin +tired +to +toe +together +tomorrow +tongue +tooth +top 
+touch +town +trade +train +transport +tray +tree +trick +trouble +trousers +true +turn +twist +umbrella +under +unit +up +use +value +verse +very +vessel +view +violent +voice +waiting +walk +wall +war +warm +wash +waste +watch +water +wave +wax +way +weather +week +weight +well +west +wet +wheel +when +where +while +whip +whistle +white +who +why +wide +will +wind +window +wine +wing +winter +wire +wise +with +woman +wood +wool +word +work +worm +wound +writing +wrong +year +yellow +yes +yesterday +you +young \ No newline at end of file diff --git a/contrib/wordnet-blast/check/check.sh b/contrib/wordnet-blast/check/check.sh new file mode 100644 index 00000000000..cc8b6ee5ad7 --- /dev/null +++ b/contrib/wordnet-blast/check/check.sh @@ -0,0 +1,16 @@ +#!/bin/bash + +WNHOME=/usr/share/wordnet/ + +check() { + local word_list="$1" + echo "./bin/wntest $WNHOME ${word_list}" + time ./bin/wntest $WNHOME ${word_list} > ${word_list}.blast + echo "for i in \`cat ${word_list}\`; do wn $i -over; done" + time for i in `cat ${word_list}`; do wn $i -over; done > ${word_list}.wn + + echo "diff ${word_list}.wn ${word_list}.blast -b" + colordiff -y ${word_list}.wn ${word_list}.blast -b +} + +check "$1" \ No newline at end of file diff --git a/contrib/wordnet-blast/check/list.txt b/contrib/wordnet-blast/check/list.txt new file mode 100644 index 00000000000..71189cb8af1 --- /dev/null +++ b/contrib/wordnet-blast/check/list.txt @@ -0,0 +1,7 @@ +cat +lions +city +building +salvation +medications +haven diff --git a/contrib/wordnet-blast/wnb/bfs.hh b/contrib/wordnet-blast/wnb/bfs.hh new file mode 100644 index 00000000000..c448ffbb79e --- /dev/null +++ b/contrib/wordnet-blast/wnb/bfs.hh @@ -0,0 +1,72 @@ +#ifndef _BFS_HH +# define _BFS_HH + +# include +# include + +namespace wnb +{ + struct synset; + + namespace bfs // breadth first search tools + { + /// bfs_visitor + /// Sum distances and throw answer if target synset found + template + class distance_recorder : public boost::default_bfs_visitor + { + public: + distance_recorder(DistanceMap dist, const synset& s, int max) + : d(dist), target(s), max_length(max) + { } + + template + void tree_edge(Edge e, const Graph& g) const + { + typename boost::graph_traits::vertex_descriptor + u = boost::source(e, g), v = boost::target(e, g); + d[v] = d[u] + 1; + + if (g[v] == target) + throw d[v]; + if (d[v] > max_length) + throw -1; + } + private: + DistanceMap d; + const synset& target; + int max_length; + }; + + /// Convenience function + template + distance_recorder + record_distance(DistanceMap d, const synset& s, int m) + { + return distance_recorder(d, s, m); + } + + /// This predicate function object determines which edges of the original + /// graph will show up in the filtered graph. 
+ //FIXME: Do we really need a map here (check cost of property_map construction + // / should be light) + template + struct hypo_hyper_edge { + hypo_hyper_edge() { } + hypo_hyper_edge(PointerSymbolMap pointer_symbol) + : m_pointer_symbol(pointer_symbol) { } + template + bool operator()(const Edge& e) const { + int p_s = get(m_pointer_symbol, e); + //see pointer symbol list in info_helper.hh + return p_s == 1 || p_s == 2 || p_s == 3 || p_s == 4; + } + PointerSymbolMap m_pointer_symbol; + }; + + } // end of wnb::bfs + +} // end of namespace wnb + +#endif /* _BFS_HH */ + diff --git a/contrib/wordnet-blast/wnb/core/info_helper.cc b/contrib/wordnet-blast/wnb/core/info_helper.cc new file mode 100644 index 00000000000..605704b796d --- /dev/null +++ b/contrib/wordnet-blast/wnb/core/info_helper.cc @@ -0,0 +1,148 @@ +#include "info_helper.hh" + +#include +#include +#include +#include + +#include + +namespace wnb +{ + + // Class info_helper + + /// List of pointer symbols + const char * + info_helper::symbols[info_helper::NB_SYMBOLS] = { + "!" , // 0 Antonym + "@" , // 1 Hypernym + "@i", // 2 Instance Hypernym + "~" , // 3 Hyponym + "~i", // 4 Instance Hyponym + "#m", // 5 Member holonym + "#s", // 6 Substance holonym + "#p", // 7 Part holonym + "%m", // 8 Member meronym + "%s", // 9 Substance meronym + "%p", // 10 Part meronym + "=" , // 11 Attribute + "+" , // 12 Derivationally related form + ";c", // 13 Domain of synset - TOPIC + "-c", // 14 Member of this domain - TOPIC + ";r", // 15 Domain of synset - REGION + "-r", // 16 Member of this domain - REGION + ";u", // 17 Domain of synset - USAGE + "-u", // 18 Member of this domain - USAGE + + //The pointer_symbol s for verbs are: + "*", // 19 Entailment + ">", // 20 Cause + "^", // 21 Also see + "$", // 22 Verb Group + + //The pointer_symbol s for adjectives are: + "&", // 23 Similar to + "<", // 24 Participle of verb + "\\", // 25 Pertainym (pertains to noun) + "=", // 26 Attribute + }; + + const std::string info_helper::sufx[] = { + /* Noun suffixes */ + "s", "ses", "xes", "zes", "ches", "shes", "men", "ies", + /* Verb suffixes */ + "s", "ies", "es", "es", "ed", "ed", "ing", "ing", + /* Adjective suffixes */ + "er", "est", "er", "est" + }; + + const std::string info_helper::addr[] = { + /* Noun endings */ + "", "s", "x", "z", "ch", "sh", "man", "y", + /* Verb endings */ + "", "y", "e", "", "e", "", "e", "", + /* Adjective endings */ + "", "", "e", "e" + }; + + const int info_helper::offsets[info_helper::NUMPARTS] = { 0, 0, 8, 16, 0, 0 }; + const int info_helper::cnts[info_helper::NUMPARTS] = { 0, 8, 8, 4, 0, 0 }; + + void + info_helper::update_pos_maps() + { + // http://wordnet.princeton.edu/wordnet/man/wndb.5WN.html#sect3 + + indice_offset[UNKNOWN] = 0; + + indice_offset[N] = 0; + indice_offset[V] = indice_offset[N] + pos_maps[N].size(); + indice_offset[A] = indice_offset[V] + pos_maps[V].size(); + indice_offset[R] = indice_offset[A] + pos_maps[A].size(); + indice_offset[S] = indice_offset[R] + pos_maps[R].size(); + + } + + int info_helper::compute_indice(int offset, pos_t pos) + { + if (pos == S) + pos = A; + std::map& map = pos_maps[pos]; + + assert(pos <= 5 && pos > 0); + + return indice_offset[pos] + map[offset]; + } + + // Function definitions + + // Return relation between synset indices and offsets + static + std::map + preprocess_data(const std::string& fn) + { + std::map map; + std::ifstream file(fn.c_str()); + if (!file.is_open()) + throw std::runtime_error("preprocess_data: File not found: " + fn); + + std::string row; + + //skip 
header + const unsigned int header_nb_lines = 29; + for(std::size_t i = 0; i < header_nb_lines; i++) + std::getline(file, row); + + int ind = 0; + //parse data line + while (std::getline(file, row)) + { + std::stringstream srow(row); + int offset; + srow >> offset; + map.insert(std::pair(offset, ind)); + ind++; + } + + file.close(); + return map; + } + + info_helper + preprocess_wordnet(const std::string& dn) + { + info_helper info; + + info.pos_maps[N] = preprocess_data((dn + "data.noun")); // noun_map + info.pos_maps[V] = preprocess_data((dn + "data.verb")); // verb_map + info.pos_maps[A] = preprocess_data((dn + "data.adj")); // adj_map + info.pos_maps[R] = preprocess_data((dn + "data.adv")); // adv_map + + info.update_pos_maps(); + + return info; + } + +} // end of namespace wnb + diff --git a/contrib/wordnet-blast/wnb/core/info_helper.hh b/contrib/wordnet-blast/wnb/core/info_helper.hh new file mode 100644 index 00000000000..92a3dee5781 --- /dev/null +++ b/contrib/wordnet-blast/wnb/core/info_helper.hh @@ -0,0 +1,85 @@ +#pragma once + +# include +# include +# include + +# include "pos_t.hh" + +namespace wnb +{ + + /// Useful information for wordnet in-memory import + struct info_helper + { + /// Symbols' size + static const std::size_t NB_SYMBOLS = 27; + static const std::size_t NUMPARTS = POS_ARRAY_SIZE; + + /// List of pointer symbols + static const char * symbols[NB_SYMBOLS]; + static const std::string sufx[]; + static const std::string addr[]; + + static const int offsets[NUMPARTS]; + static const int cnts[NUMPARTS]; + + typedef std::map i2of_t; ///< indice/offset correspondences + typedef std::map pos_i2of_t; ///< pos / map correspondences + + /// Constructor + info_helper() { update_pos_maps(); } + + /// Compute the number of synsets (i.e. the number of vertex in the graph) + unsigned nb_synsets() + { + typedef pos_i2of_t::iterator iter_t; + + int sum = 0; + for (iter_t it = pos_maps.begin(); it != pos_maps.end(); it++) + sum += (*it).second.size(); + + return sum; + //return adj_map.size() + adv_map.size() + noun_map.size() + verb_map.size(); + } + + // Given a pos return the starting indice in the graph + int get_indice_offset(pos_t pos) + { + return indice_offset[pos]; + } + + /// Helper function computing global indice in graph from local offset + int compute_indice(int offset, pos_t pos); + + /// Update a map allowing one to get the correct map given a pos + void update_pos_maps(); + + int get_symbol(const std::string& ps) + { + for (unsigned i = 0; i < NB_SYMBOLS; i++) + if (ps == symbols[i]) + return i; + throw std::runtime_error("Symbol NOT FOUND."); + } + + pos_t get_pos(const char& c) + { + return get_pos_from_char(c); + } + + public: + + // i2of_t adj_map; + // i2of_t adv_map; + // i2of_t noun_map; + // i2of_t verb_map; + + pos_i2of_t pos_maps; + std::size_t indice_offset[POS_ARRAY_SIZE]; + }; + + /// Create a new info_help based on wordnet data located in dn (../dict/) + info_helper preprocess_wordnet(const std::string& dn); + +} // end of namespace wncpp diff --git a/contrib/wordnet-blast/wnb/core/load_wordnet.cc b/contrib/wordnet-blast/wnb/core/load_wordnet.cc new file mode 100644 index 00000000000..e99b44b3b9f --- /dev/null +++ b/contrib/wordnet-blast/wnb/core/load_wordnet.cc @@ -0,0 +1,381 @@ +#include "load_wordnet.hh" + +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include + +#include "wordnet.hh" +#include "info_helper.hh" +#include "pos_t.hh" + +namespace bg = boost::graph; + +namespace wnb +{ + + namespace + { + 
+ // Load synset's words + void load_data_row_words(std::stringstream& srow, synset& synset) + { + srow >> std::hex >> synset.w_cnt >> std::dec; + for (std::size_t i = 0; i < synset.w_cnt; i++) + { + //word lex_id + + std::string word; + srow >> word; + synset.words.push_back(word); + + int lex_id; + srow >> std::hex >> lex_id >> std::dec; + synset.lex_ids.push_back(lex_id); + } + } + + // Add rel to graph + void add_wordnet_rel(std::string& pointer_symbol_,// type of relation + int synset_offset, // dest offset + pos_t pos, // p.o.s. of dest + int src, // word src + int trgt, // word target + synset& synset, // source synset + wordnet& wn, // our wordnet + info_helper& info) // helper + { + //if (pos == S || synset.pos == S) + // return; //FIXME: check where are s synsets. + + int u = synset.id; + int v = info.compute_indice(synset_offset, pos); + + ptr p; + p.pointer_symbol = info.get_symbol(pointer_symbol_); + p.source = src; + p.target = trgt; + + boost::add_edge(u,v, p, wn.wordnet_graph); + } + + + // load ptrs + void load_data_row_ptrs(std::stringstream& srow, synset& synset, + wordnet& wn, info_helper& info) + { + srow >> synset.p_cnt; + for (std::size_t i = 0; i < synset.p_cnt; i++) + { + //http://wordnet.princeton.edu/wordnet/man/wndb.5WN.html#sect3 + //pointer_symbol synset_offset pos source/target + std::string pointer_symbol_; + int synset_offset; + pos_t pos; + int src; + int trgt; + + srow >> pointer_symbol_; + srow >> synset_offset; + + char c; + srow >> c; + pos = info.get_pos(c); + + //print extracted edges + //std::cout << "(" << pointer_symbol << ", " << synset_offset; + //std::cout << ", " << pos << ")" << std::endl; + + // Extract source/target words info + std::string src_trgt; + srow >> src_trgt; + std::stringstream ssrc(std::string(src_trgt,0,2)); + std::stringstream strgt(std::string(src_trgt,2,2)); + ssrc >> std::hex >> src >> std::dec; + strgt >> std::hex >> trgt >> std::dec; + + add_wordnet_rel(pointer_symbol_, synset_offset, pos, src, trgt, synset, wn, info); + } + } + + + // Load a synset and add it to the wordnet class. + void load_data_row(const std::string& row, wordnet& wn, info_helper& info) + { + //http://wordnet.princeton.edu/wordnet/man/wndb.5WN.html#sect3 + // synset_offset lex_filenum ss_type w_cnt word lex_id [word lex_id...] p_cnt [ptr...] [frames...] | gloss + synset synset; + + std::stringstream srow(row); + int synset_offset; + srow >> synset_offset; + srow >> synset.lex_filenum; + char ss_type; + srow >> ss_type; + + // extra information + synset.pos = info.get_pos(ss_type); + synset.id = info.compute_indice(synset_offset, synset.pos); + + // words + load_data_row_words(srow, synset); + + // ptrs + load_data_row_ptrs(srow, synset, wn, info); + + //frames (skipped) + std::string tmp; + while (srow >> tmp) + if (tmp == "|") + break; + + // gloss + std::getline(srow, synset.gloss); + + // extra + synset.sense_number = 0; + + // Add synset to graph + wn.wordnet_graph[synset.id] = synset; + } + + + // Parse data.noun files + void load_wordnet_data(const std::string& fn, wordnet& wn, info_helper& info) + { + std::ifstream fin(fn.c_str()); + if (!fin.is_open()) + throw std::runtime_error("File missing: " + fn); + + static const int MAX_LENGTH = 20480; + char row[MAX_LENGTH]; + + //skip header + for(unsigned i = 0; i < 29; i++) + fin.getline(row, MAX_LENGTH); + + //parse data line + while (fin.getline(row, MAX_LENGTH)) + load_data_row(row, wn, info); + + fin.close(); + } + + + //FIXME: It seems possible to replace synset_offsets with indice here. 
+ void load_index_row(const std::string& row, wordnet& wn, info_helper& info) + { + // lemma pos synset_cnt p_cnt [ptr_symbol...] sense_cnt tagsense_cnt synset_offset [synset_offset...] + index index; + std::stringstream srow(row); + + char pos; + srow >> index.lemma; + srow >> pos; + index.pos = info.get_pos(pos); // extra data + srow >> index.synset_cnt; + srow >> index.p_cnt; + + std::string tmp_p; + for (std::size_t i = 0; i < index.p_cnt; i++) + { + srow >> tmp_p; + index.ptr_symbols.push_back(tmp_p); + } + srow >> index.sense_cnt; + srow >> index.tagsense_cnt; + + int tmp_o; + while (srow >> tmp_o) + { + index.synset_offsets.push_back(tmp_o); + index.synset_ids.push_back(info.compute_indice(tmp_o, index.pos)); // extra data + } + + //add synset to index list + wn.index_list.push_back(index); + } + + + void load_wordnet_index(const std::string& fn, wordnet& wn, info_helper& info) + { + std::ifstream fin(fn.c_str()); + if (!fin.is_open()) + throw std::runtime_error("File Not Found: " + fn); + + static const int MAX_LENGTH = 20480; + char row[MAX_LENGTH]; + + //skip header + const unsigned int header_nb_lines = 29; + for(std::size_t i = 0; i < header_nb_lines; i++) + fin.getline(row, MAX_LENGTH); + + //parse data line + while (fin.getline(row, MAX_LENGTH)) + load_index_row(row, wn, info); + + fin.close(); + } + + + void load_wordnet_exc(const std::string& dn, std::string cat, + wordnet& wn, info_helper&) + { + std::string fn = dn + cat + ".exc"; + std::ifstream fin(fn.c_str()); + if (!fin.is_open()) + throw std::runtime_error("File Not Found: " + fn); + + std::map& exc = wn.exc[get_pos_from_name(cat)]; + + std::string row; + + std::string key, value; + while (std::getline(fin, row)) + { + std::stringstream srow(row); + srow >> key; + srow >> value; + + exc[key] = value; + } + } + + void load_wordnet_cat(const std::string dn, std::string cat, + wordnet& wn, info_helper& info) + { + load_wordnet_data((dn + "data." + cat), wn, info); + load_wordnet_index((dn + "index." 
+ cat), wn, info); + load_wordnet_exc(dn, cat, wn, info); + } + + // FIXME: this file is not in all packaged version of wordnet + void load_wordnet_index_sense(const std::string& dn, wordnet& wn, info_helper& info) + { + std::string fn = dn + "index.sense"; + std::ifstream fin(fn.c_str()); + if (!fin.is_open()) + throw std::runtime_error("File Not Found: " + fn); + + std::string row; + std::string sense_key; + int synset_offset; + while (std::getline(fin, row)) + { + std::stringstream srow(row); + srow >> sense_key; + + // Get the pos of the lemma + std::vector sk = ext::split(sense_key,'%'); + std::string word = sk.at(0); + std::stringstream tmp(ext::split(sk.at(1), ':').at(0)); + int ss_type; + tmp >> ss_type; + pos_t pos = (pos_t) ss_type; + + srow >> synset_offset; + + // Update synset info + int u = info.compute_indice(synset_offset, pos); + int sense_number; + srow >> sense_number; + wn.wordnet_graph[u].sense_number += sense_number; + int tag_cnt; + srow >> tag_cnt; + if (tag_cnt != 0) + wn.wordnet_graph[u].tag_cnts.push_back( make_pair(word,tag_cnt) ); + + //if (synset_offset == 2121620) + // std::cout << u << " " << word << " " << synset_offset << " " + // << wn.wordnet_graph[u].tag_cnt << " " + // << wn.wordnet_graph[u].words[0] << std::endl; + } + } + + // wn -over used info in cntlist even if this is deprecated + // It is ok not to FIX and use this function + void load_wordnet_cntlist(const std::string& dn, wordnet& wn, info_helper& info) + { + std::string fn = dn + "cntlist"; + std::ifstream fin(fn.c_str()); + if (!fin.is_open()) + throw std::runtime_error("File Not Found: " + fn); + + std::string sense_key; + int sense_number; + int tag_cnt; + + std::string row; + while (std::getline(fin, row)) + { + std::stringstream srow(row); + + srow >> sense_key; + srow >> sense_number; + srow >> tag_cnt; + + // Get the pos of the lemma + std::string word = ext::split(sense_key,'%').at(0); + std::stringstream tmp(ext::split(ext::split(sense_key,'%').at(1), ':').at(0)); + int ss_type; + tmp >> ss_type; + pos_t pos = (pos_t) ss_type; + + // Update synset info + int synset_offset; // FIXME + int u = info.compute_indice(synset_offset, pos); + wn.wordnet_graph[u].sense_number += sense_number; + if (tag_cnt != 0) + wn.wordnet_graph[u].tag_cnts.push_back( make_pair(word,tag_cnt) ); + } + } + + } // end of anonymous namespace + + void load_wordnet(const std::string& dn, wordnet& wn, info_helper& info) + { + // vertex added in this order a n r v + + std::string fn = dn; + + if (wn._verbose) + { + std::cout << std::endl; + std::cout << "### Loading Wordnet 3.0"; + boost::progress_display show_progress(5); + boost::progress_timer t; + + load_wordnet_cat(dn, "adj", wn, info); + ++show_progress; + load_wordnet_cat(dn, "noun", wn, info); + ++show_progress; + load_wordnet_cat(dn, "adv", wn, info); + ++show_progress; + load_wordnet_cat(dn, "verb", wn, info); + ++show_progress; + load_wordnet_index_sense(dn, wn, info); + ++show_progress; + std::cout << std::endl; + } + else + { + load_wordnet_cat(dn, "adj", wn, info); + load_wordnet_cat(dn, "noun", wn, info); + load_wordnet_cat(dn, "adv", wn, info); + load_wordnet_cat(dn, "verb", wn, info); + load_wordnet_index_sense(dn, wn, info); + } + + std::stable_sort(wn.index_list.begin(), wn.index_list.end()); + } + +} // end of namespace wnb diff --git a/contrib/wordnet-blast/wnb/core/load_wordnet.hh b/contrib/wordnet-blast/wnb/core/load_wordnet.hh new file mode 100644 index 00000000000..ef23e111d1f --- /dev/null +++ 
b/contrib/wordnet-blast/wnb/core/load_wordnet.hh @@ -0,0 +1,12 @@ +#pragma once + +# include "info_helper.hh" + +namespace wnb +{ + /// forward declaration + struct wordnet; + + /// Load the entire wordnet data base located in \p dn (typically .../dict/) + void load_wordnet(const std::string& dn, wordnet& wn, info_helper& info); +} diff --git a/contrib/wordnet-blast/wnb/core/pos_t.hh b/contrib/wordnet-blast/wnb/core/pos_t.hh new file mode 100644 index 00000000000..18eac3b6131 --- /dev/null +++ b/contrib/wordnet-blast/wnb/core/pos_t.hh @@ -0,0 +1,61 @@ +#pragma once + +namespace wnb +{ + + static const std::size_t POS_ARRAY_SIZE = 6; + static const char POS_ARRAY[POS_ARRAY_SIZE] = {'u', 'n', 'v', 'a', 'r', 's'}; + + enum pos_t + { + UNKNOWN = 0, + N = 1, + V = 2, + A = 3, + R = 4, + S = 5, + }; + + + inline pos_t get_pos_from_name(const std::string& pos) + { + if (pos == "adj") + return A; + if (pos == "noun") + return N; + if (pos == "adv") + return R; + if (pos == "verb") + return V; + if (pos == "adj sat") + return S; + return UNKNOWN; + } + + inline std::string get_name_from_pos(const pos_t& pos) + { + switch (pos) + { + case A: return "adj"; + case N: return "noun"; + case R: return "adv"; + case V: return "verb"; + case S: return "adj sat"; + default: return "UNKNOWN"; + } + } + + inline pos_t get_pos_from_char(const char& c) + { + switch (c) + { + case 'a': return A; + case 'n': return N; + case 'r': return R; + case 'v': return V; + case 's': return S; + default: return UNKNOWN; + } + } + +} // end of namespace wncpp diff --git a/contrib/wordnet-blast/wnb/core/wordnet.cc b/contrib/wordnet-blast/wnb/core/wordnet.cc new file mode 100644 index 00000000000..ee7a1548362 --- /dev/null +++ b/contrib/wordnet-blast/wnb/core/wordnet.cc @@ -0,0 +1,186 @@ +#include +#include + +#include +#include +#include +#include +#include +#include + +namespace wnb +{ + + //FIXME: Make (smart) use of fs::path + wordnet::wordnet(const std::string& wordnet_dir, bool verbose) + : _verbose(verbose) + { + if (_verbose) + { + std::cout << wordnet_dir << std::endl; + } + + info = preprocess_wordnet(wordnet_dir); + + wordnet_graph = graph(info.nb_synsets()); + load_wordnet(wordnet_dir, *this, info); + + if (_verbose) + { + std::cout << "nb_synsets: " << info.nb_synsets() << std::endl; + } + //FIXME: this check is only valid for Wordnet 3.0 + //assert(info.nb_synsets() == 142335);//117659); + assert(info.nb_synsets() > 0); + } + + std::vector + wordnet::get_synsets(const std::string& word, pos_t pos) + { + std::vector synsets; + + // morphing + std::string mword = morphword(word, pos); + if (mword == "") + return synsets; + + // binary_search + typedef std::vector vi; + std::pair bounds = get_indexes(mword); + + vi::iterator it; + for (it = bounds.first; it != bounds.second; it++) + { + if (pos == pos_t::UNKNOWN || it->pos == pos) + { + for (std::size_t i = 0; i < it->synset_ids.size(); i++) + { + int id = it->synset_ids[i]; + synsets.push_back(wordnet_graph[id]); + } + } + } + + return synsets; + } + + const std::vector * + wordnet::get_synset(const std::string& word, pos_t pos) const { + + typedef std::vector vi; + std::pair bounds = get_indexes_const(word); + + for (vi::const_iterator it = bounds.first; it != bounds.second; it++) + { + if (pos == pos_t::UNKNOWN || it->pos == pos) + { + int id = it->synset_ids[0]; + return &wordnet_graph[id].words; + } + } + return nullptr; + } + + std::pair::const_iterator, std::vector::const_iterator> + wordnet::get_indexes_const(const std::string& word) const + { + index 
light_index; + light_index.lemma = word; + + typedef std::vector vi; + std::pair bounds = + std::equal_range(index_list.begin(), index_list.end(), light_index); + + return bounds; + } + + std::pair::iterator, std::vector::iterator> + wordnet::get_indexes(const std::string& word) + { + index light_index; + light_index.lemma = word; + + typedef std::vector vi; + std::pair bounds = + std::equal_range(index_list.begin(), index_list.end(), light_index); + + return bounds; + } + + std::string + wordnet::wordbase(const std::string& word, int ender) + { + if (ext::ends_with(word, info.sufx[ender])) + { + int sufxlen = info.sufx[ender].size(); + std::string strOut = word.substr(0, word.size() - sufxlen); + if (!info.addr[ender].empty()) + strOut += info.addr[ender]; + return strOut; + } + return word; + } + + bool is_defined(const std::string& word, pos_t pos) + { + // hack FIXME: Some verbs are built with -e suffix ('builde' is just an example). + if (pos == V && word == "builde") + return false; + return true; + } + + // Try to find baseform (lemma) of individual word in POS + std::string + wordnet::morphword(const std::string& word, pos_t pos) + { + // first look for word on exception list + exc_t::iterator it = exc[pos].find(word); + if (it != exc[pos].end()) + return it->second; // found in exception list + + std::string tmpbuf; + std::string end; + int cnt = 0; + + if (pos == R) + return ""; // Only use exception list for adverbs + + if (pos == N) + { + if (ext::ends_with(word, "ful")) + { + cnt = word.size() - 3; + tmpbuf = word.substr(0, cnt); + end = "ful"; + } + else + { + // check for noun ending with 'ss' or short words + if (ext::ends_with(word, "ss") || word.size() <= 2) + return ""; + } + } + + // If not in exception list, try applying rules from tables + + if (tmpbuf.size() == 0) + tmpbuf = word; + + if (pos != pos_t::UNKNOWN) + { + int offset = info.offsets[pos]; + int pos_cnt = info.cnts[pos]; + + std::string morphed; + for (int i = 0; i < pos_cnt; i++) + { + morphed = wordbase(tmpbuf, (i + offset)); + if (morphed != tmpbuf && is_defined(morphed, pos)) + return morphed + end; + } + return morphed; + } + return word; + } + +} // end of namespace wnb diff --git a/contrib/wordnet-blast/wnb/core/wordnet.hh b/contrib/wordnet-blast/wnb/core/wordnet.hh new file mode 100644 index 00000000000..b5437db464c --- /dev/null +++ b/contrib/wordnet-blast/wnb/core/wordnet.hh @@ -0,0 +1,113 @@ +#pragma once + +# include +# include +# include +# include +//# include + +//Possible https://bugs.launchpad.net/ubuntu/+source/boost/+bug/270873 +# include +# include + +# include "load_wordnet.hh" +# include "pos_t.hh" + +namespace wnb +{ + + /// More info here: http://wordnet.princeton.edu/wordnet/man/wndb.5WN.html + + struct info_helper; + + /// Synset + struct synset + { + int lex_filenum; + std::size_t w_cnt; + std::vector words; + std::vector lex_ids; + std::size_t p_cnt; + std::string gloss; + + // extra + pos_t pos; ///< pos (replace ss_type) + int id; ///< unique identifier (replace synset_offset) + int sense_number; ///< http://wordnet.princeton.edu/man/senseidx.5WN.html + std::vector > tag_cnts; ///< http://wordnet.princeton.edu/man/senseidx.5WN.html + + bool operator==(const synset& s) const { return (id == s.id); } + bool operator<(const synset& s) const { return (id < s.id); } + }; + + + /// Rel between synsets properties + struct ptr + { + //std::string pointer_symbol; ///< symbol of the relation + int pointer_symbol; + int source; ///< source word inside synset + int target; ///< target word 
inside synset + }; + + + /// Index + struct index + { + std::string lemma; + + std::size_t synset_cnt; + std::size_t p_cnt; + std::size_t sense_cnt; + float tagsense_cnt; + std::vector ptr_symbols; + std::vector synset_offsets; + + // extra + std::vector synset_ids; + pos_t pos; + + bool operator<(const index& b) const + { + return (lemma.compare(b.lemma) < 0); + } + }; + + + /// Wordnet interface class + struct wordnet + { + typedef boost::adjacency_list graph; ///< boost graph type + + /// Constructor + wordnet(const std::string& wordnet_dir, bool verbose=false); + + /// Return synsets matching word + std::vector get_synsets(const std::string& word, pos_t pos = pos_t::UNKNOWN); + //FIXME: todo + std::vector get_synset(const std::string& word, char pos, int i); + // added + const std::vector * get_synset(const std::string& word, pos_t pos = pos_t::UNKNOWN) const; + + std::pair::iterator, std::vector::iterator> + get_indexes(const std::string& word); + + std::pair::const_iterator, std::vector::const_iterator> + get_indexes_const(const std::string& word) const; + + std::string wordbase(const std::string& word, int ender); + + std::string morphword(const std::string& word, pos_t pos); + + std::vector index_list; ///< index list // FIXME: use a map + graph wordnet_graph; ///< synsets graph + info_helper info; ///< helper object + bool _verbose; + + typedef std::map exc_t; + std::map exc; + }; + +} // end of namespace wnb diff --git a/contrib/wordnet-blast/wnb/main.cc b/contrib/wordnet-blast/wnb/main.cc new file mode 100644 index 00000000000..4041464c430 --- /dev/null +++ b/contrib/wordnet-blast/wnb/main.cc @@ -0,0 +1,180 @@ + +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include + +using namespace wnb; +using namespace boost; +using namespace boost::algorithm; + +bool usage(int argc, char ** argv) +{ + std::string dir; + if (argc >= 2) + dir = std::string(argv[1]); + if (argc != 3 || dir[dir.length()-1] != '/') + { + std::cout << argv[0] << " .../wordnet_dir/ word_list_file" << std::endl; + return true; + } + return false; +} + +struct ws +{ + std::string w; + float s; + + bool operator<(const ws& a) const {return s > a.s;} +}; + + +/// Compute similarity of word with words in word list +std::vector +compute_similarities(wordnet& wn, + const std::string& word, + const std::vector& word_list) +{ + std::vector wslist; + std::vector synsets1 = wn.get_synsets(word); + + for (unsigned i = 0; i < synsets1.size(); i++) + for (unsigned k = 0; k < synsets1[i].words.size(); k++) + std::cout << " - " << synsets1[i].words[k] << std::endl; + + nltk_similarity path_similarity(wn); + { + progress_timer t; + progress_display show_progress(word_list.size()); + + for (unsigned k = 0; k < word_list.size(); k++) + { + const std::string& w = word_list[k]; + float max = 0; + std::vector synsets2 = wn.get_synsets(w); + for (unsigned i = 0; i < synsets1.size(); i++) + { + for (unsigned j = 0; j < synsets2.size(); j++) + { + float s = path_similarity(synsets1[i], synsets2[j], 6); + if (s > max) + max = s; + } + } + ws e = {w, max}; + wslist.push_back(e); + ++show_progress; + } + } + + return wslist; +} + +void similarity_test(wordnet& wn, + const std::string& word, + std::vector& word_list) +{ + std::vector wslist = compute_similarities(wn, word, word_list); + + std::stable_sort(wslist.begin(), wslist.end()); + for (unsigned i = 0; i < std::min(wslist.size(), size_t(10)); i++) + std::cout << wslist[i].w << " " << wslist[i].s << std::endl; +} + +void 
print_synsets(pos_t pos, wnb::index& idx, wordnet& wn) +{ + std::string& mword = idx.lemma; + std::cout << "\nOverview of " << get_name_from_pos(pos) << " " << mword << "\n\n"; + std::cout << "The " << get_name_from_pos(pos) << " " << mword << " has " + << idx.synset_ids.size() << ((idx.synset_ids.size() == 1) ? " sense": " senses"); + + if (idx.tagsense_cnt != 0) + std::cout << " (first " << idx.tagsense_cnt << " from tagged texts)"; + else + std::cout << " (no senses from tagged texts)"; + + std::cout << "\n"; + std::cout << " \n"; + + for (std::size_t i = 0; i < idx.synset_ids.size(); i++) + { + int id = idx.synset_ids[i]; + const synset& synset = wn.wordnet_graph[id]; + + std::cout << i+1 << ". "; + for (std::size_t k = 0; k < synset.tag_cnts.size(); k++) + { + if (synset.tag_cnts[k].first == mword) + std::cout << "(" << synset.tag_cnts[k].second << ") "; + } + + std::vector nwords; + for (auto& w : synset.words) + nwords.push_back((pos == A) ? w.substr(0, w.find_first_of("(")) : w); + + std::cout << replace_all_copy(join(nwords, ", "), "_", " "); + std::cout << " -- (" << trim_copy(synset.gloss) << ")"; + std::cout << std::endl; + } +} + +void wn_like(wordnet& wn, const std::string& word, pos_t pos) +{ + if (word == "") + return; + + typedef std::vector vi; + std::pair bounds = wn.get_indexes(word); + + for (vi::iterator it = bounds.first; it != bounds.second; it++) + { + if (pos != -1 && it->pos == pos) + { + print_synsets(pos, *it, wn); + } + } +} + +void batch_test(wordnet& wn, std::vector& word_list) +{ + for (std::size_t i = 0; i < word_list.size(); i++) + { + for (unsigned p = 1; p < POS_ARRAY_SIZE; p++) + { + pos_t pos = (pos_t) p; + + wn_like(wn, word_list[i], pos); + std::string mword = wn.morphword(word_list[i], pos); + if (mword != word_list[i]) + wn_like(wn, mword, pos); + } + } +} + +int main(int argc, char ** argv) +{ + if (usage(argc, argv)) + return 1; + + // read command line + std::string wordnet_dir = argv[1]; + std::string test_file = argv[2]; + + wordnet wn(wordnet_dir); + + // read test file + std::string list = ext::read_file(test_file); + std::vector wl = ext::split(list); + + batch_test(wn, wl); +} + diff --git a/contrib/wordnet-blast/wnb/nltk_similarity.hh b/contrib/wordnet-blast/wnb/nltk_similarity.hh new file mode 100644 index 00000000000..f7256b366bc --- /dev/null +++ b/contrib/wordnet-blast/wnb/nltk_similarity.hh @@ -0,0 +1,146 @@ +#ifndef _NLTK_SIMILARITY_HH +# define _NLTK_SIMILARITY_HH + +# include +# include +# include + +namespace wnb +{ + namespace internal + { + + //Helper class filtering out other than hypernym relations + template + struct hyper_edge + { + hyper_edge() { } + + hyper_edge(PointerSymbolMap pointer_symbol) + : m_pointer_symbol(pointer_symbol) { } + + template + bool operator()(const Edge& e) const + { + int p_s = get(m_pointer_symbol, e); + return p_s == 1; // hypernyme (instance_hypernyme not used here) + } + + PointerSymbolMap m_pointer_symbol; + }; + + } // end of anonymous namespace + + + class nltk_similarity + { + + typedef boost::property_map::type PointerSymbolMap; + typedef boost::filtered_graph > G; + typedef boost::graph_traits::vertex_descriptor vertex; + + internal::hyper_edge filter; + G fg; + + public: + + nltk_similarity(wordnet& wn) + : filter(get(&ptr::pointer_symbol, wn.wordnet_graph)), + fg(wn.wordnet_graph, filter) + { } + + /// Get list of hypernyms of s along with distance to s + std::map hypernym_map(vertex s); + + /// Get shortest path between and synset1 and synset2. 
+ int shortest_path_distance(const synset& synset1, const synset& synset2); + + /// return disance + float operator()(const synset& synset1, const synset& synset2, int=0); + + }; + + std::map + nltk_similarity::hypernym_map(nltk_similarity::vertex s) + { + std::map map; + + // Python: + // for (hypernym in self[HYPERNYM]) + // distances |= hypernym.hypernym_distances(distance+1); + + boost::graph_traits::out_edge_iterator e, e_end; + std::queue q; + + q.push(s); + map[s] = 0; + while (!q.empty()) + { + vertex u = q.front(); q.pop(); + + int new_d = map[u] + 1; + for (boost::tuples::tie(e, e_end) = out_edges(u, fg); e != e_end; ++e) + { + vertex v = target(*e,fg); + q.push(v); + + if (map.find(v) != map.end()) + { + if (new_d < map[v]) + map[v] = new_d; + else + q.pop(); + } + else + map[v] = new_d; + } + } + + return map; + } + + + int + nltk_similarity::shortest_path_distance(const synset& synset1, const synset& synset2) + { + vertex v1 = synset1.id; + vertex v2 = synset2.id; + + std::map map1 = hypernym_map(v1); + std::map map2 = hypernym_map(v2); + + // For each ancestor synset common to both subject synsets, find the + // connecting path length. Return the shortest of these. + + int path_distance = -1; + std::map::iterator it, it2; + for (it = map1.begin(); it != map1.end(); it++) + for (it2 = map2.begin(); it2 != map2.end(); it2++) + if (fg[it->first] == fg[it2->first]) + { + int new_distance = it->second + it2->second; + if (path_distance < 0 || new_distance < path_distance) + path_distance = new_distance; + } + + return path_distance; + } + + + float + nltk_similarity::operator()(const synset& synset1, const synset& synset2, int) + { + int distance = shortest_path_distance(synset1, synset2); + if (distance >= 0) + return 1. / (distance + 1); + else + return -1; + } + + +} // end of namespace wnb + +#endif /* _NLTK_SIMILARITY_HH */ + diff --git a/contrib/wordnet-blast/wnb/std_ext.hh b/contrib/wordnet-blast/wnb/std_ext.hh new file mode 100644 index 00000000000..81a010731d8 --- /dev/null +++ b/contrib/wordnet-blast/wnb/std_ext.hh @@ -0,0 +1,90 @@ +#ifndef _STD_EXT_HH +# define _STD_EXT_HH + +# include +# include +# include +# include +# include + +namespace ext +{ + /// Read a file, return the content as a C++ string + inline + std::string read_file(const std::string& fn) + { + std::ifstream is; + is.open(fn.c_str(), std::ios::binary); + if (!is.is_open()) + throw std::runtime_error("File not found: " + fn); + + std::string str((std::istreambuf_iterator(is)), + std::istreambuf_iterator()); + + return str; + } + + /// Split a std::string + inline + std::vector split(const std::string& str) + { + std::vector tokens; + std::istringstream iss(str); + copy(std::istream_iterator(iss), + std::istream_iterator(), + std::back_inserter< std::vector >(tokens)); + return tokens; + } + + /// Split a std::string on separator + inline + std::vector split(const std::string& s, char seperator) + { + std::vector output; + std::string::size_type prev_pos = 0, pos = 0; + + while((pos = s.find(seperator, pos)) != std::string::npos) + { + std::string substring( s.substr(prev_pos, pos-prev_pos) ); + output.push_back(substring); + prev_pos = ++pos; + } + + output.push_back(s.substr(prev_pos, pos-prev_pos)); + return output; + } + + inline + bool + ends_with(const std::string& str, const std::string& ending) + { + if (str.length() >= ending.length()) + { + int cmp = str.compare(str.length() - ending.length(), + ending.length(), ending); + return (0 == cmp); + } + return false; +} + + + /// Sorted unique + 
template + inline + T s_unique(T& v) + { + T out; + + std::sort(v.begin(), v.end()); + typename T::iterator last = std::unique(v.begin(),v.end()); + + out.resize(last - v.begin()); + std::copy(v.begin(), last, out.begin()); + + return out; + } + +} // end of ext + +#endif /* _STD_EXT_HH */ + diff --git a/src/Functions/CMakeLists.txt b/src/Functions/CMakeLists.txt index 7e21bd0eed6..20b2cbf2387 100644 --- a/src/Functions/CMakeLists.txt +++ b/src/Functions/CMakeLists.txt @@ -123,4 +123,5 @@ endif() # Signed integer overflow on user-provided data inside boost::geometry - ignore. set_source_files_properties("pointInPolygon.cpp" PROPERTIES COMPILE_FLAGS -fno-sanitize=signed-integer-overflow) -target_link_libraries(clickhouse_functions PRIVATE stemmer) \ No newline at end of file +target_link_libraries(clickhouse_functions PRIVATE stemmer) +target_link_libraries(clickhouse_functions PRIVATE wnb) \ No newline at end of file diff --git a/src/Interpreters/SynonymsExtensions.cpp b/src/Functions/SynonymsExtensions.cpp similarity index 86% rename from src/Interpreters/SynonymsExtensions.cpp rename to src/Functions/SynonymsExtensions.cpp index 772f210a488..43779414ee9 100644 --- a/src/Interpreters/SynonymsExtensions.cpp +++ b/src/Functions/SynonymsExtensions.cpp @@ -1,10 +1,11 @@ #include -#include +#include #include #include #include +#include namespace DB { @@ -48,7 +49,7 @@ public: } } - Synset * getSynonyms(const std::string_view & token) const override + const Synset * getSynonyms(const std::string_view & token) const override { auto it = table.find(token); @@ -62,20 +63,23 @@ public: class WordnetSynonymsExtension : public ISynonymsExtension { private: - // std::vector> data; + wnb::wordnet wn; public: - WordnetSynonymsExtension(const String & /*path*/) - { - - } + WordnetSynonymsExtension(const String & path) : wn(path) {} - Synset * getSynonyms(const std::string_view & /*token*/) const override + const Synset * getSynonyms(const std::string_view & token) const override { - return nullptr; + return wn.get_synset(std::string(token)); } }; +/// Duplicate of code from StringUtils.h. Copied here for less dependencies. +static bool startsWith(const std::string & s, const char * prefix) +{ + return s.size() >= strlen(prefix) && 0 == memcmp(s.data(), prefix, strlen(prefix)); +} + SynonymsExtensions::SynonymsExtensions(const Poco::Util::AbstractConfiguration & config) { String prefix = "synonyms_extensions"; @@ -89,7 +93,7 @@ SynonymsExtensions::SynonymsExtensions(const Poco::Util::AbstractConfiguration & for (const auto & key : keys) { - if (key == "extension") + if (startsWith(key, "extension")) { const auto & ext_name = config.getString(prefix + "." + key + ".name", ""); const auto & ext_path = config.getString(prefix + "." 
+ key + ".path", ""); diff --git a/src/Interpreters/SynonymsExtensions.h b/src/Functions/SynonymsExtensions.h similarity index 91% rename from src/Interpreters/SynonymsExtensions.h rename to src/Functions/SynonymsExtensions.h index 0d97d672130..d8bd5fc3029 100644 --- a/src/Interpreters/SynonymsExtensions.h +++ b/src/Functions/SynonymsExtensions.h @@ -19,7 +19,7 @@ public: //ISynonymsExtension(const String & path); - virtual Synset * getSynonyms(const std::string_view & token) const = 0; + virtual const Synset * getSynonyms(const std::string_view & token) const = 0; virtual ~ISynonymsExtension() = default; }; diff --git a/src/Functions/synonyms.cpp b/src/Functions/synonyms.cpp index fc03c10e3e4..00b45f021c4 100644 --- a/src/Functions/synonyms.cpp +++ b/src/Functions/synonyms.cpp @@ -6,7 +6,7 @@ #include #include #include -#include +#include #include #include diff --git a/src/Interpreters/Context.cpp b/src/Interpreters/Context.cpp index f3b4dffd1f3..9eab10eef09 100644 --- a/src/Interpreters/Context.cpp +++ b/src/Interpreters/Context.cpp @@ -53,7 +53,6 @@ #include #include #include -#include #include #include #include @@ -75,7 +74,7 @@ #include #include #include - +#include namespace ProfileEvents {