From d66d97790bbcde76ef5eba77929887ed0a3dcb6e Mon Sep 17 00:00:00 2001 From: Tommaso Teofili Date: Tue, 14 Feb 2012 22:13:34 +0000 Subject: [PATCH] [LUCENE-3731] - Creating the analysis-uima module for UIMA based tokenizers/analyzers git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1244236 13f79535-47bb-0310-9956-ffa450edef68 --- dev-tools/idea/.idea/ant.xml | 1 + dev-tools/idea/.idea/modules.xml | 1 + dev-tools/idea/.idea/workspace.xml | 7 + dev-tools/idea/solr/contrib/uima/uima.iml | 3 +- .../maven/modules/analysis/pom.xml.template | 1 + lucene/contrib/contrib-build.xml | 11 + modules/analysis/README.txt | 6 + modules/analysis/build.xml | 15 +- modules/analysis/uima/build.xml | 46 ++++ .../uima/lib/uima-an-tagger-2.3.1.jar | 2 + .../uima/lib/uima-an-tagger-LICENSE-ASL.txt | 202 ++++++++++++++++++ .../uima/lib/uima-an-tagger-NOTICE.txt | 7 + .../analysis/uima/lib/uima-an-wst-2.3.1.jar | 2 + .../uima/lib/uima-an-wst-LICENSE-ASL.txt | 202 ++++++++++++++++++ .../analysis/uima/lib/uima-an-wst-NOTICE.txt | 7 + .../analysis/uima/lib/uimaj-core-2.3.1.jar | 2 + .../uima/lib/uimaj-core-LICENSE-ASL.txt | 202 ++++++++++++++++++ .../analysis/uima/lib/uimaj-core-NOTICE.txt | 13 ++ .../analysis/uima/BaseUIMATokenizer.java | 81 +++++++ .../uima/UIMAAnnotationsTokenizer.java | 93 ++++++++ .../analysis/uima/UIMABaseAnalyzer.java | 42 ++++ .../analysis/uima/UIMATypeAwareAnalyzer.java | 42 ++++ .../UIMATypeAwareAnnotationsTokenizer.java | 110 ++++++++++ .../lucene/analysis/uima/ae/AEProvider.java | 36 ++++ .../analysis/uima/ae/AEProviderFactory.java | 73 +++++++ .../analysis/uima/ae/BasicAEProvider.java | 67 ++++++ .../uima/ae/OverridingParamsAEProvider.java | 102 +++++++++ .../resources/uima/AggregateSentenceAE.xml | 70 ++++++ .../uima/AggregateDummySentenceAE.xml | 59 +++++ .../src/test-files/uima/DummyEntityAE.xml | 68 ++++++ .../src/test-files/uima/DummyPoSTagger.xml | 50 +++++ .../analysis/uima/UIMABaseAnalyzerTest.java | 125 +++++++++++ .../uima/UIMATypeAwareAnalyzerTest.java | 67 ++++++ .../analysis/uima/ae/BasicAEProviderTest.java | 36 ++++ .../ae/OverridingParamsAEProviderTest.java | 61 ++++++ .../uima/an/DummyEntityAnnotator.java | 61 ++++++ .../analysis/uima/an/DummyPoSTagger.java | 57 +++++ 37 files changed, 2028 insertions(+), 2 deletions(-) create mode 100644 modules/analysis/uima/build.xml create mode 100644 modules/analysis/uima/lib/uima-an-tagger-2.3.1.jar create mode 100644 modules/analysis/uima/lib/uima-an-tagger-LICENSE-ASL.txt create mode 100644 modules/analysis/uima/lib/uima-an-tagger-NOTICE.txt create mode 100644 modules/analysis/uima/lib/uima-an-wst-2.3.1.jar create mode 100644 modules/analysis/uima/lib/uima-an-wst-LICENSE-ASL.txt create mode 100644 modules/analysis/uima/lib/uima-an-wst-NOTICE.txt create mode 100644 modules/analysis/uima/lib/uimaj-core-2.3.1.jar create mode 100644 modules/analysis/uima/lib/uimaj-core-LICENSE-ASL.txt create mode 100644 modules/analysis/uima/lib/uimaj-core-NOTICE.txt create mode 100644 modules/analysis/uima/src/java/org/apache/lucene/analysis/uima/BaseUIMATokenizer.java create mode 100644 modules/analysis/uima/src/java/org/apache/lucene/analysis/uima/UIMAAnnotationsTokenizer.java create mode 100644 modules/analysis/uima/src/java/org/apache/lucene/analysis/uima/UIMABaseAnalyzer.java create mode 100644 modules/analysis/uima/src/java/org/apache/lucene/analysis/uima/UIMATypeAwareAnalyzer.java create mode 100644 modules/analysis/uima/src/java/org/apache/lucene/analysis/uima/UIMATypeAwareAnnotationsTokenizer.java create mode 100644 modules/analysis/uima/src/java/org/apache/lucene/analysis/uima/ae/AEProvider.java create mode 100644 modules/analysis/uima/src/java/org/apache/lucene/analysis/uima/ae/AEProviderFactory.java create mode 100644 modules/analysis/uima/src/java/org/apache/lucene/analysis/uima/ae/BasicAEProvider.java create mode 100644 modules/analysis/uima/src/java/org/apache/lucene/analysis/uima/ae/OverridingParamsAEProvider.java create mode 100644 modules/analysis/uima/src/resources/uima/AggregateSentenceAE.xml create mode 100644 modules/analysis/uima/src/test-files/uima/AggregateDummySentenceAE.xml create mode 100644 modules/analysis/uima/src/test-files/uima/DummyEntityAE.xml create mode 100644 modules/analysis/uima/src/test-files/uima/DummyPoSTagger.xml create mode 100644 modules/analysis/uima/src/test/org/apache/lucene/analysis/uima/UIMABaseAnalyzerTest.java create mode 100644 modules/analysis/uima/src/test/org/apache/lucene/analysis/uima/UIMATypeAwareAnalyzerTest.java create mode 100644 modules/analysis/uima/src/test/org/apache/lucene/analysis/uima/ae/BasicAEProviderTest.java create mode 100644 modules/analysis/uima/src/test/org/apache/lucene/analysis/uima/ae/OverridingParamsAEProviderTest.java create mode 100644 modules/analysis/uima/src/test/org/apache/lucene/analysis/uima/an/DummyEntityAnnotator.java create mode 100644 modules/analysis/uima/src/test/org/apache/lucene/analysis/uima/an/DummyPoSTagger.java diff --git a/dev-tools/idea/.idea/ant.xml b/dev-tools/idea/.idea/ant.xml index 4f4642abd4e..aad6d5771da 100644 --- a/dev-tools/idea/.idea/ant.xml +++ b/dev-tools/idea/.idea/ant.xml @@ -19,6 +19,7 @@ + diff --git a/dev-tools/idea/.idea/modules.xml b/dev-tools/idea/.idea/modules.xml index 2b7a15d3a54..3d1682b497b 100644 --- a/dev-tools/idea/.idea/modules.xml +++ b/dev-tools/idea/.idea/modules.xml @@ -17,6 +17,7 @@ + diff --git a/dev-tools/idea/.idea/workspace.xml b/dev-tools/idea/.idea/workspace.xml index 66cca3f8ad0..a0505cadb80 100644 --- a/dev-tools/idea/.idea/workspace.xml +++ b/dev-tools/idea/.idea/workspace.xml @@ -186,6 +186,13 @@ + + + + diff --git a/dev-tools/maven/modules/analysis/pom.xml.template b/dev-tools/maven/modules/analysis/pom.xml.template index bb6a6f93a1c..51f8fb8ad1c 100644 --- a/dev-tools/maven/modules/analysis/pom.xml.template +++ b/dev-tools/maven/modules/analysis/pom.xml.template @@ -38,6 +38,7 @@ phonetic smartcn stempel + uima build/lucene-analysis-modules-aggregator diff --git a/lucene/contrib/contrib-build.xml b/lucene/contrib/contrib-build.xml index a64d9f2bfa5..22ef4e61053 100644 --- a/lucene/contrib/contrib-build.xml +++ b/lucene/contrib/contrib-build.xml @@ -162,6 +162,17 @@ + + + + + + + + + + + diff --git a/modules/analysis/README.txt b/modules/analysis/README.txt index 797911a7874..b579ad4dc63 100644 --- a/modules/analysis/README.txt +++ b/modules/analysis/README.txt @@ -41,6 +41,10 @@ lucene-analyzers-stempel-XX.jar An add-on analysis library that contains a universal algorithmic stemmer, including tables for the Polish language. +lucene-analyzers-uima-XX.jar + An add-on analysis library that contains tokenizers/analyzers using + Apache UIMA extracted annotations to identify tokens/types/etc. + common/src/java icu/src/java kuromoji/src/java @@ -48,6 +52,7 @@ morfologik/src/java phonetic/src/java smartcn/src/java stempel/src/java +uima/src/java The source code for the libraries. common/src/test @@ -57,4 +62,5 @@ morfologik/src/test phonetic/src/test smartcn/src/test stempel/src/test +uima/src/test Unit tests for the libraries. diff --git a/modules/analysis/build.xml b/modules/analysis/build.xml index 7df7f1ae917..ea761b18af4 100644 --- a/modules/analysis/build.xml +++ b/modules/analysis/build.xml @@ -27,6 +27,7 @@ - morfologik: Morfologik Stemmer - smartcn: Smart Analyzer for Simplified Chinese Text - stempel: Algorithmic Stemmer for Polish + - uima: UIMA Analysis module @@ -57,8 +58,12 @@ + + + + - + @@ -68,6 +73,7 @@ + @@ -77,6 +83,7 @@ + @@ -86,6 +93,7 @@ + @@ -95,6 +103,7 @@ + @@ -104,6 +113,7 @@ + @@ -116,6 +126,7 @@ + @@ -126,6 +137,7 @@ + @@ -136,6 +148,7 @@ + diff --git a/modules/analysis/uima/build.xml b/modules/analysis/uima/build.xml new file mode 100644 index 00000000000..8998c090434 --- /dev/null +++ b/modules/analysis/uima/build.xml @@ -0,0 +1,46 @@ + + + + + + + + UIMA Analysis module + + + + + + + + + + + + + + + + + + + + + + + diff --git a/modules/analysis/uima/lib/uima-an-tagger-2.3.1.jar b/modules/analysis/uima/lib/uima-an-tagger-2.3.1.jar new file mode 100644 index 00000000000..e30333f76a9 --- /dev/null +++ b/modules/analysis/uima/lib/uima-an-tagger-2.3.1.jar @@ -0,0 +1,2 @@ +AnyObjectId[dffd510b7429dcbe37a283da92cbf06c1cfbe383] was removed in git history. +Apache SVN contains full history. \ No newline at end of file diff --git a/modules/analysis/uima/lib/uima-an-tagger-LICENSE-ASL.txt b/modules/analysis/uima/lib/uima-an-tagger-LICENSE-ASL.txt new file mode 100644 index 00000000000..d6456956733 --- /dev/null +++ b/modules/analysis/uima/lib/uima-an-tagger-LICENSE-ASL.txt @@ -0,0 +1,202 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/modules/analysis/uima/lib/uima-an-tagger-NOTICE.txt b/modules/analysis/uima/lib/uima-an-tagger-NOTICE.txt new file mode 100644 index 00000000000..f0333573f58 --- /dev/null +++ b/modules/analysis/uima/lib/uima-an-tagger-NOTICE.txt @@ -0,0 +1,7 @@ + +UIMA Annotator: Tagger +Copyright 2006-2010 The Apache Software Foundation + +This product includes software developed at +The Apache Software Foundation (http://www.apache.org/). + diff --git a/modules/analysis/uima/lib/uima-an-wst-2.3.1.jar b/modules/analysis/uima/lib/uima-an-wst-2.3.1.jar new file mode 100644 index 00000000000..289f397f6ba --- /dev/null +++ b/modules/analysis/uima/lib/uima-an-wst-2.3.1.jar @@ -0,0 +1,2 @@ +AnyObjectId[10866014d8887bfdd8bfec43d3fdd780428d4ed4] was removed in git history. +Apache SVN contains full history. \ No newline at end of file diff --git a/modules/analysis/uima/lib/uima-an-wst-LICENSE-ASL.txt b/modules/analysis/uima/lib/uima-an-wst-LICENSE-ASL.txt new file mode 100644 index 00000000000..d6456956733 --- /dev/null +++ b/modules/analysis/uima/lib/uima-an-wst-LICENSE-ASL.txt @@ -0,0 +1,202 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/modules/analysis/uima/lib/uima-an-wst-NOTICE.txt b/modules/analysis/uima/lib/uima-an-wst-NOTICE.txt new file mode 100644 index 00000000000..8169ca6a328 --- /dev/null +++ b/modules/analysis/uima/lib/uima-an-wst-NOTICE.txt @@ -0,0 +1,7 @@ + +UIMA Annotator: WhitespaceTokenizer +Copyright 2006-2010 The Apache Software Foundation + +This product includes software developed at +The Apache Software Foundation (http://www.apache.org/). + diff --git a/modules/analysis/uima/lib/uimaj-core-2.3.1.jar b/modules/analysis/uima/lib/uimaj-core-2.3.1.jar new file mode 100644 index 00000000000..4c7359f79ae --- /dev/null +++ b/modules/analysis/uima/lib/uimaj-core-2.3.1.jar @@ -0,0 +1,2 @@ +AnyObjectId[e876a9749eed73ec2c95b83cf534d7a373130569] was removed in git history. +Apache SVN contains full history. \ No newline at end of file diff --git a/modules/analysis/uima/lib/uimaj-core-LICENSE-ASL.txt b/modules/analysis/uima/lib/uimaj-core-LICENSE-ASL.txt new file mode 100644 index 00000000000..d6456956733 --- /dev/null +++ b/modules/analysis/uima/lib/uimaj-core-LICENSE-ASL.txt @@ -0,0 +1,202 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/modules/analysis/uima/lib/uimaj-core-NOTICE.txt b/modules/analysis/uima/lib/uimaj-core-NOTICE.txt new file mode 100644 index 00000000000..8f69a54f4ed --- /dev/null +++ b/modules/analysis/uima/lib/uimaj-core-NOTICE.txt @@ -0,0 +1,13 @@ + +UIMA Base: uimaj-core +Copyright 2006-2010 The Apache Software Foundation + +This product includes software developed at +The Apache Software Foundation (http://www.apache.org/). + +Portions of Apache UIMA were originally developed by +International Business Machines Corporation and are +licensed to the Apache Software Foundation under the +"Software Grant License Agreement", informally known as the +"IBM UIMA License Agreement". +Copyright (c) 2003, 2006 IBM Corporation. diff --git a/modules/analysis/uima/src/java/org/apache/lucene/analysis/uima/BaseUIMATokenizer.java b/modules/analysis/uima/src/java/org/apache/lucene/analysis/uima/BaseUIMATokenizer.java new file mode 100644 index 00000000000..a20ac1c2f22 --- /dev/null +++ b/modules/analysis/uima/src/java/org/apache/lucene/analysis/uima/BaseUIMATokenizer.java @@ -0,0 +1,81 @@ +package org.apache.lucene.analysis.uima; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.analysis.Tokenizer; +import org.apache.uima.analysis_engine.AnalysisEngine; +import org.apache.uima.analysis_engine.AnalysisEngineProcessException; +import org.apache.uima.cas.CAS; +import org.apache.uima.cas.FSIterator; +import org.apache.uima.cas.text.AnnotationFS; +import org.apache.uima.resource.ResourceInitializationException; + +import java.io.IOException; +import java.io.Reader; + +/** + * Abstract base implementation of a {@link Tokenizer} which is able to analyze the given input with a + * UIMA {@link AnalysisEngine} + */ +public abstract class BaseUIMATokenizer extends Tokenizer { + + protected FSIterator iterator; + + protected BaseUIMATokenizer(Reader reader) { + super(reader); + } + + /** + * analyzes the tokenizer input using the given analysis engine + * + * @param analysisEngine the AE to use for analyzing the tokenizer input + * @return CAS with extracted metadata (UIMA annotations, feature structures) + * @throws ResourceInitializationException + * + * @throws AnalysisEngineProcessException + * @throws IOException + */ + protected CAS analyzeInput(AnalysisEngine analysisEngine) throws ResourceInitializationException, + AnalysisEngineProcessException, IOException { + CAS cas = analysisEngine.newCAS(); + cas.setDocumentText(toString(input)); + analysisEngine.process(cas); + analysisEngine.destroy(); + return cas; + } + + private String toString(Reader reader) throws IOException { + StringBuilder stringBuilder = new StringBuilder(); + int ch; + while ((ch = reader.read()) > -1) { + stringBuilder.append((char) ch); + } + return stringBuilder.toString(); + } + + @Override + public void reset(Reader input) throws IOException { + super.reset(input); + iterator = null; + } + + @Override + public void end() throws IOException { + iterator = null; + } +} diff --git a/modules/analysis/uima/src/java/org/apache/lucene/analysis/uima/UIMAAnnotationsTokenizer.java b/modules/analysis/uima/src/java/org/apache/lucene/analysis/uima/UIMAAnnotationsTokenizer.java new file mode 100644 index 00000000000..6191c88672a --- /dev/null +++ b/modules/analysis/uima/src/java/org/apache/lucene/analysis/uima/UIMAAnnotationsTokenizer.java @@ -0,0 +1,93 @@ +package org.apache.lucene.analysis.uima; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; +import org.apache.lucene.analysis.uima.ae.AEProviderFactory; +import org.apache.uima.analysis_engine.AnalysisEngine; +import org.apache.uima.analysis_engine.AnalysisEngineProcessException; +import org.apache.uima.cas.CAS; +import org.apache.uima.cas.Type; +import org.apache.uima.cas.text.AnnotationFS; +import org.apache.uima.resource.ResourceInitializationException; +import org.apache.uima.util.InvalidXMLException; + +import java.io.IOException; +import java.io.Reader; + +/** + * a {@link Tokenizer} which creates tokens from UIMA Annotations + */ +public final class UIMAAnnotationsTokenizer extends BaseUIMATokenizer { + + private final CharTermAttribute termAttr; + + private final OffsetAttribute offsetAttr; + + private final String tokenTypeString; + + private final String descriptorPath; + + private int finalOffset = 0; + + public UIMAAnnotationsTokenizer(String descriptorPath, String tokenType, Reader input) { + super(input); + this.tokenTypeString = tokenType; + this.termAttr = addAttribute(CharTermAttribute.class); + this.offsetAttr = addAttribute(OffsetAttribute.class); + this.descriptorPath = descriptorPath; + } + + private void analyzeText(String descriptorPath) throws IOException, ResourceInitializationException, + AnalysisEngineProcessException { + AnalysisEngine ae = AEProviderFactory.getInstance().getAEProvider("", descriptorPath).getAE(); + CAS cas = analyzeInput(ae); + finalOffset = correctOffset(cas.getDocumentText().length()); + Type tokenType = cas.getTypeSystem().getType(tokenTypeString); + iterator = cas.getAnnotationIndex(tokenType).iterator(); + } + + @Override + public boolean incrementToken() throws IOException { + if (iterator == null) { + try { + analyzeText(descriptorPath); + } catch (Exception e) { + throw new IOException(e); + } + } + if (iterator.hasNext()) { + clearAttributes(); + AnnotationFS next = iterator.next(); + termAttr.append(next.getCoveredText()); + offsetAttr.setOffset(correctOffset(next.getBegin()), correctOffset(next.getEnd())); + return true; + } else { + return false; + } + } + + @Override + public void end() throws IOException { + if (offsetAttr.endOffset() < finalOffset) + offsetAttr.setOffset(finalOffset, finalOffset); + super.end(); + } +} diff --git a/modules/analysis/uima/src/java/org/apache/lucene/analysis/uima/UIMABaseAnalyzer.java b/modules/analysis/uima/src/java/org/apache/lucene/analysis/uima/UIMABaseAnalyzer.java new file mode 100644 index 00000000000..e7fb4b25621 --- /dev/null +++ b/modules/analysis/uima/src/java/org/apache/lucene/analysis/uima/UIMABaseAnalyzer.java @@ -0,0 +1,42 @@ +package org.apache.lucene.analysis.uima; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.analysis.Analyzer; + +import java.io.Reader; + +/** + * An {@link Analyzer} which use the {@link UIMAAnnotationsTokenizer} for creating tokens + */ +public final class UIMABaseAnalyzer extends Analyzer { + + private final String descriptorPath; + private final String tokenType; + + public UIMABaseAnalyzer(String descriptorPath, String tokenType) { + this.descriptorPath = descriptorPath; + this.tokenType = tokenType; + } + + @Override + protected TokenStreamComponents createComponents(String fieldName, Reader reader) { + return new TokenStreamComponents(new UIMAAnnotationsTokenizer(descriptorPath, tokenType, reader)); + } + +} diff --git a/modules/analysis/uima/src/java/org/apache/lucene/analysis/uima/UIMATypeAwareAnalyzer.java b/modules/analysis/uima/src/java/org/apache/lucene/analysis/uima/UIMATypeAwareAnalyzer.java new file mode 100644 index 00000000000..930351da377 --- /dev/null +++ b/modules/analysis/uima/src/java/org/apache/lucene/analysis/uima/UIMATypeAwareAnalyzer.java @@ -0,0 +1,42 @@ +package org.apache.lucene.analysis.uima; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.analysis.Analyzer; + +import java.io.Reader; + +/** + * {@link Analyzer} which uses the {@link UIMATypeAwareAnnotationsTokenizer} for the tokenization phase + */ +public final class UIMATypeAwareAnalyzer extends Analyzer { + private final String descriptorPath; + private final String tokenType; + private final String featurePath; + + public UIMATypeAwareAnalyzer(String descriptorPath, String tokenType, String featurePath) { + this.descriptorPath = descriptorPath; + this.tokenType = tokenType; + this.featurePath = featurePath; + } + + @Override + protected TokenStreamComponents createComponents(String fieldName, Reader reader) { + return new TokenStreamComponents(new UIMATypeAwareAnnotationsTokenizer(descriptorPath, tokenType, featurePath, reader)); + } +} diff --git a/modules/analysis/uima/src/java/org/apache/lucene/analysis/uima/UIMATypeAwareAnnotationsTokenizer.java b/modules/analysis/uima/src/java/org/apache/lucene/analysis/uima/UIMATypeAwareAnnotationsTokenizer.java new file mode 100644 index 00000000000..016b8ee466d --- /dev/null +++ b/modules/analysis/uima/src/java/org/apache/lucene/analysis/uima/UIMATypeAwareAnnotationsTokenizer.java @@ -0,0 +1,110 @@ +package org.apache.lucene.analysis.uima; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; +import org.apache.lucene.analysis.tokenattributes.TypeAttribute; +import org.apache.lucene.analysis.uima.ae.AEProviderFactory; +import org.apache.uima.analysis_engine.AnalysisEngine; +import org.apache.uima.analysis_engine.AnalysisEngineProcessException; +import org.apache.uima.cas.CAS; +import org.apache.uima.cas.CASException; +import org.apache.uima.cas.FeaturePath; +import org.apache.uima.cas.Type; +import org.apache.uima.cas.text.AnnotationFS; +import org.apache.uima.resource.ResourceInitializationException; +import org.apache.uima.util.InvalidXMLException; + +import java.io.IOException; +import java.io.Reader; + +/** + * A {@link Tokenizer} which creates token from UIMA Annotations filling also their {@link TypeAttribute} according to + * {@link org.apache.uima.cas.FeaturePath}s specified + */ +public final class UIMATypeAwareAnnotationsTokenizer extends BaseUIMATokenizer { + + private final TypeAttribute typeAttr; + + private final CharTermAttribute termAttr; + + private final OffsetAttribute offsetAttr; + + private final String tokenTypeString; + + private final String descriptorPath; + + private final String typeAttributeFeaturePath; + + private FeaturePath featurePath; + + private int finalOffset = 0; + + public UIMATypeAwareAnnotationsTokenizer(String descriptorPath, String tokenType, String typeAttributeFeaturePath, Reader input) { + super(input); + this.tokenTypeString = tokenType; + this.termAttr = addAttribute(CharTermAttribute.class); + this.typeAttr = addAttribute(TypeAttribute.class); + this.offsetAttr = addAttribute(OffsetAttribute.class); + this.typeAttributeFeaturePath = typeAttributeFeaturePath; + this.descriptorPath = descriptorPath; + } + + private void analyzeText() throws IOException, ResourceInitializationException, AnalysisEngineProcessException, + CASException { + AnalysisEngine ae = AEProviderFactory.getInstance().getAEProvider("", descriptorPath).getAE(); + CAS cas = analyzeInput(ae); + finalOffset = correctOffset(cas.getDocumentText().length()); + Type tokenType = cas.getTypeSystem().getType(tokenTypeString); + iterator = cas.getAnnotationIndex(tokenType).iterator(); + featurePath = cas.createFeaturePath(); + featurePath.initialize(typeAttributeFeaturePath); + } + + @Override + public boolean incrementToken() throws IOException { + if (iterator == null) { + try { + analyzeText(); + } catch (Exception e) { + throw new IOException(e); + } + } + if (iterator.hasNext()) { + clearAttributes(); + AnnotationFS next = iterator.next(); + termAttr.append(next.getCoveredText()); + offsetAttr.setOffset(correctOffset(next.getBegin()), correctOffset(next.getEnd())); + typeAttr.setType(featurePath.getValueAsString(next)); + return true; + } else { + return false; + } + } + + @Override + public void end() throws IOException { + if (offsetAttr.endOffset() < finalOffset) + offsetAttr.setOffset(finalOffset, finalOffset); + super.end(); + } + + +} diff --git a/modules/analysis/uima/src/java/org/apache/lucene/analysis/uima/ae/AEProvider.java b/modules/analysis/uima/src/java/org/apache/lucene/analysis/uima/ae/AEProvider.java new file mode 100644 index 00000000000..6615a7de9ce --- /dev/null +++ b/modules/analysis/uima/src/java/org/apache/lucene/analysis/uima/ae/AEProvider.java @@ -0,0 +1,36 @@ +package org.apache.lucene.analysis.uima.ae; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.uima.analysis_engine.AnalysisEngine; +import org.apache.uima.resource.ResourceInitializationException; + +/** + * provide an Apache UIMA {@link AnalysisEngine} + * + */ +public interface AEProvider { + + /** + * + * @return + * @throws ResourceInitializationException + */ + public AnalysisEngine getAE() throws ResourceInitializationException; + +} diff --git a/modules/analysis/uima/src/java/org/apache/lucene/analysis/uima/ae/AEProviderFactory.java b/modules/analysis/uima/src/java/org/apache/lucene/analysis/uima/ae/AEProviderFactory.java new file mode 100644 index 00000000000..46be1a66232 --- /dev/null +++ b/modules/analysis/uima/src/java/org/apache/lucene/analysis/uima/ae/AEProviderFactory.java @@ -0,0 +1,73 @@ +package org.apache.lucene.analysis.uima.ae; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.util.HashMap; +import java.util.Map; + +/** + * Singleton factory class responsible of {@link AEProvider}s' creation + * + */ +public class AEProviderFactory { + + private static AEProviderFactory instance; + + private final Map providerCache = new HashMap(); + + private AEProviderFactory() { + // Singleton + } + + public static AEProviderFactory getInstance() { + if (instance == null) { + instance = new AEProviderFactory(); + } + return instance; + } + + /** + * + * @param keyPrefix + * @param aePath + * @return + */ + public synchronized AEProvider getAEProvider(String keyPrefix, String aePath) { + String key = new StringBuilder(keyPrefix).append(aePath).append(BasicAEProvider.class).toString(); + if (providerCache.get(key) == null) { + providerCache.put(key, new BasicAEProvider(aePath)); + } + return providerCache.get(key); + } + + /** + * + * @param keyPrefix + * @param aePath + * @param runtimeParameters + * @return + */ + public synchronized AEProvider getAEProvider(String keyPrefix, String aePath, + Map runtimeParameters) { + String key = new StringBuilder(keyPrefix).append(aePath).append(OverridingParamsAEProvider.class).toString(); + if (providerCache.get(key) == null) { + providerCache.put(key, new OverridingParamsAEProvider(aePath, runtimeParameters)); + } + return providerCache.get(key); + } +} diff --git a/modules/analysis/uima/src/java/org/apache/lucene/analysis/uima/ae/BasicAEProvider.java b/modules/analysis/uima/src/java/org/apache/lucene/analysis/uima/ae/BasicAEProvider.java new file mode 100644 index 00000000000..aceacda5eb6 --- /dev/null +++ b/modules/analysis/uima/src/java/org/apache/lucene/analysis/uima/ae/BasicAEProvider.java @@ -0,0 +1,67 @@ +package org.apache.lucene.analysis.uima.ae; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.uima.UIMAFramework; +import org.apache.uima.analysis_engine.AnalysisEngine; +import org.apache.uima.analysis_engine.AnalysisEngineDescription; +import org.apache.uima.resource.ResourceInitializationException; +import org.apache.uima.util.XMLInputSource; + +/** + * Basic {@link AEProvider} which just instantiates a UIMA {@link AnalysisEngine} with no additional metadata, + * parameters or resources + */ +public class BasicAEProvider implements AEProvider { + + private final String aePath; + private AnalysisEngine cachedAE; + + public BasicAEProvider(String aePath) { + this.aePath = aePath; + } + + @Override + public synchronized AnalysisEngine getAE() throws ResourceInitializationException { + try { + if (cachedAE == null) { + // get Resource Specifier from XML file + + XMLInputSource in; + try { + in = new XMLInputSource(aePath); + } catch (Exception e) { + in = new XMLInputSource(getClass().getResource(aePath)); + } + + // get AE description + AnalysisEngineDescription desc = UIMAFramework.getXMLParser() + .parseAnalysisEngineDescription(in); + + // create AE here + cachedAE = UIMAFramework.produceAnalysisEngine(desc); + } else { + cachedAE.reconfigure(); + } + } catch (Exception e) { + cachedAE = null; + throw new ResourceInitializationException(e); + } + return cachedAE; + } +} diff --git a/modules/analysis/uima/src/java/org/apache/lucene/analysis/uima/ae/OverridingParamsAEProvider.java b/modules/analysis/uima/src/java/org/apache/lucene/analysis/uima/ae/OverridingParamsAEProvider.java new file mode 100644 index 00000000000..4d0938b590a --- /dev/null +++ b/modules/analysis/uima/src/java/org/apache/lucene/analysis/uima/ae/OverridingParamsAEProvider.java @@ -0,0 +1,102 @@ +package org.apache.lucene.analysis.uima.ae; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.uima.UIMAFramework; +import org.apache.uima.analysis_engine.AnalysisEngine; +import org.apache.uima.analysis_engine.AnalysisEngineDescription; +import org.apache.uima.resource.ResourceInitializationException; +import org.apache.uima.util.XMLInputSource; + +import java.util.Map; + +/** + * {@link AEProvider} implementation that creates an Aggregate AE from the given path, also + * injecting runtime parameters defined in the solrconfig.xml Solr configuration file and assigning + * them as overriding parameters in the aggregate AE + */ +public class OverridingParamsAEProvider implements AEProvider { + + private final String aePath; + + private AnalysisEngine cachedAE; + + private final Map runtimeParameters; + + public OverridingParamsAEProvider(String aePath, Map runtimeParameters) { + this.aePath = aePath; + this.runtimeParameters = runtimeParameters; + } + + @Override + public synchronized AnalysisEngine getAE() throws ResourceInitializationException { + try { + if (cachedAE == null) { + // get Resource Specifier from XML file + XMLInputSource in; + try { + in = new XMLInputSource(aePath); + } catch (Exception e) { + in = new XMLInputSource(getClass().getResource(aePath)); + } + + // get AE description + AnalysisEngineDescription desc = UIMAFramework.getXMLParser() + .parseAnalysisEngineDescription(in); + + /* iterate over each AE (to set runtime parameters) */ + for (String attributeName : runtimeParameters.keySet()) { + Object val = getRuntimeValue(desc, attributeName); + desc.getAnalysisEngineMetaData().getConfigurationParameterSettings().setParameterValue( + attributeName, val); + } + // create AE here + cachedAE = UIMAFramework.produceAnalysisEngine(desc); + } else { + cachedAE.reconfigure(); + } + } catch (Exception e) { + cachedAE = null; + throw new ResourceInitializationException(e); + } + return cachedAE; + } + + /* create the value to inject in the runtime parameter depending on its declared type */ + private Object getRuntimeValue(AnalysisEngineDescription desc, String attributeName) { + String type = desc.getAnalysisEngineMetaData().getConfigurationParameterDeclarations(). + getConfigurationParameter(null, attributeName).getType(); + // TODO : do it via reflection ? i.e. Class paramType = Class.forName(type)... + Object val = null; + Object runtimeValue = runtimeParameters.get(attributeName); + if (runtimeValue != null) { + if ("String".equals(type)) { + val = String.valueOf(runtimeValue); + } else if ("Integer".equals(type)) { + val = Integer.valueOf(runtimeValue.toString()); + } else if ("Boolean".equals(type)) { + val = Boolean.valueOf(runtimeValue.toString()); + } else if ("Float".equals(type)) { + val = Float.valueOf(runtimeValue.toString()); + } + } + + return val; + } + +} \ No newline at end of file diff --git a/modules/analysis/uima/src/resources/uima/AggregateSentenceAE.xml b/modules/analysis/uima/src/resources/uima/AggregateSentenceAE.xml new file mode 100644 index 00000000000..73d697e220d --- /dev/null +++ b/modules/analysis/uima/src/resources/uima/AggregateSentenceAE.xml @@ -0,0 +1,70 @@ + + + + org.apache.uima.java + false + + + + + + + + + + AggregateSentenceAE + + 1.0 + + + + ngramsize + Integer + false + false + + HmmTagger/NGRAM_SIZE + + + + + + + WhitespaceTokenizer + HmmTagger + + + + + + + + org.apache.uima.SentenceAnnotation + org.apache.uima.TokenAnnotation + + + + + + true + true + false + + + + diff --git a/modules/analysis/uima/src/test-files/uima/AggregateDummySentenceAE.xml b/modules/analysis/uima/src/test-files/uima/AggregateDummySentenceAE.xml new file mode 100644 index 00000000000..8769b189404 --- /dev/null +++ b/modules/analysis/uima/src/test-files/uima/AggregateDummySentenceAE.xml @@ -0,0 +1,59 @@ + + + + org.apache.uima.java + false + + + + + + + + + + AggregateSentenceAE + + 1.0 + + + + + WhitespaceTokenizer + DummyPoSTagger + + + + + + + + org.apache.uima.SentenceAnnotation + org.apache.uima.TokenAnnotation + + + + + + true + true + false + + + + diff --git a/modules/analysis/uima/src/test-files/uima/DummyEntityAE.xml b/modules/analysis/uima/src/test-files/uima/DummyEntityAE.xml new file mode 100644 index 00000000000..8827562a569 --- /dev/null +++ b/modules/analysis/uima/src/test-files/uima/DummyEntityAE.xml @@ -0,0 +1,68 @@ + + + + org.apache.uima.java + true + org.apache.lucene.analysis.uima.an.DummyEntityAnnotator + + DummyPoSTagger + + 1.0 + ASF + + + + + + org.apache.solr.uima.ts.EntityAnnotation + + uima.tcas.Annotation + + + name + + uima.cas.String + + + entity + + uima.cas.String + + + + + + + + + + + + org.apache.solr.uima.ts.EntityAnnotation + + + + + + true + true + false + + + + diff --git a/modules/analysis/uima/src/test-files/uima/DummyPoSTagger.xml b/modules/analysis/uima/src/test-files/uima/DummyPoSTagger.xml new file mode 100644 index 00000000000..7677502b959 --- /dev/null +++ b/modules/analysis/uima/src/test-files/uima/DummyPoSTagger.xml @@ -0,0 +1,50 @@ + + + + org.apache.uima.java + true + org.apache.lucene.analysis.uima.an.DummyPoSTagger + + DummyPoSTagger + + 1.0 + ASF + + + + + + + + + org.apache.uima.TokenAnnotation + + + org.apache.uima.TokenAnnotation + + + + + + true + true + false + + + + diff --git a/modules/analysis/uima/src/test/org/apache/lucene/analysis/uima/UIMABaseAnalyzerTest.java b/modules/analysis/uima/src/test/org/apache/lucene/analysis/uima/UIMABaseAnalyzerTest.java new file mode 100644 index 00000000000..7b828527201 --- /dev/null +++ b/modules/analysis/uima/src/test/org/apache/lucene/analysis/uima/UIMABaseAnalyzerTest.java @@ -0,0 +1,125 @@ +package org.apache.lucene.analysis.uima; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.analysis.BaseTokenStreamTestCase; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.document.TextField; +import org.apache.lucene.index.DirectoryReader; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.IndexWriterConfig; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.MatchAllDocsQuery; +import org.apache.lucene.search.TopDocs; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.RAMDirectory; +import org.apache.lucene.util.Version; +import org.junit.After; +import org.junit.Before; +import org.junit.Test; + +import java.io.StringReader; + +/** + * Testcase for {@link UIMABaseAnalyzer} + */ +public class UIMABaseAnalyzerTest extends BaseTokenStreamTestCase { + + private UIMABaseAnalyzer analyzer; + + @Before + public void setUp() throws Exception { + super.setUp(); + analyzer = new UIMABaseAnalyzer("/uima/AggregateSentenceAE.xml", "org.apache.uima.TokenAnnotation"); + } + + @After + public void tearDown() throws Exception { + analyzer.close(); + super.tearDown(); + } + + @Test + public void baseUIMAAnalyzerStreamTest() throws Exception { + TokenStream ts = analyzer.tokenStream("text", new StringReader("the big brown fox jumped on the wood")); + assertTokenStreamContents(ts, new String[]{"the", "big", "brown", "fox", "jumped", "on", "the", "wood"}); + } + + @Test + public void baseUIMAAnalyzerIntegrationTest() throws Exception { + Directory dir = new RAMDirectory(); + IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(Version.LUCENE_40, analyzer)); + // add the first doc + Document doc = new Document(); + String dummyTitle = "this is a dummy title "; + doc.add(new Field("title", dummyTitle, TextField.TYPE_STORED)); + String dummyContent = "there is some content written here"; + doc.add(new Field("contents", dummyContent, TextField.TYPE_STORED)); + writer.addDocument(doc, analyzer); + writer.commit(); + + // try the search over the first doc + DirectoryReader directoryReader = DirectoryReader.open(dir); + IndexSearcher indexSearcher = new IndexSearcher(directoryReader); + TopDocs result = indexSearcher.search(new MatchAllDocsQuery(), 10); + assertTrue(result.totalHits > 0); + Document d = indexSearcher.doc(result.scoreDocs[0].doc); + assertNotNull(d); + assertNotNull(d.getField("title")); + assertEquals(dummyTitle, d.getField("title").stringValue()); + assertNotNull(d.getField("contents")); + assertEquals(dummyContent, d.getField("contents").stringValue()); + + // add a second doc + doc = new Document(); + String dogmasTitle = "dogmas"; + doc.add(new Field("title", dogmasTitle, TextField.TYPE_STORED)); + String dogmasContents = "white men can't jump"; + doc.add(new Field("contents", dogmasContents, TextField.TYPE_STORED)); + writer.addDocument(doc, analyzer); + writer.commit(); + + directoryReader.close(); + directoryReader = DirectoryReader.open(dir); + indexSearcher = new IndexSearcher(directoryReader); + result = indexSearcher.search(new MatchAllDocsQuery(), 10); + Document d1 = indexSearcher.doc(result.scoreDocs[1].doc); + assertNotNull(d1); + assertNotNull(d1.getField("title")); + assertEquals(dogmasTitle, d1.getField("title").stringValue()); + assertNotNull(d1.getField("contents")); + assertEquals(dogmasContents, d1.getField("contents").stringValue()); + + // do a matchalldocs query to retrieve both docs + indexSearcher = new IndexSearcher(directoryReader); + result = indexSearcher.search(new MatchAllDocsQuery(), 10); + assertEquals(2, result.totalHits); + writer.close(); + indexSearcher.getIndexReader().close(); + dir.close(); + } + + @Test + public void testRandomStrings() throws Exception { + checkRandomData(random, new UIMABaseAnalyzer("/uima/AggregateSentenceAE.xml", "org.apache.uima.TokenAnnotation"), + 1000 * RANDOM_MULTIPLIER); + } + +} diff --git a/modules/analysis/uima/src/test/org/apache/lucene/analysis/uima/UIMATypeAwareAnalyzerTest.java b/modules/analysis/uima/src/test/org/apache/lucene/analysis/uima/UIMATypeAwareAnalyzerTest.java new file mode 100644 index 00000000000..e7b4de9ea67 --- /dev/null +++ b/modules/analysis/uima/src/test/org/apache/lucene/analysis/uima/UIMATypeAwareAnalyzerTest.java @@ -0,0 +1,67 @@ +package org.apache.lucene.analysis.uima; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.analysis.BaseTokenStreamTestCase; +import org.apache.lucene.analysis.TokenStream; +import org.junit.After; +import org.junit.Before; +import org.junit.Test; + +import java.io.StringReader; + +/** + * Testcase for {@link UIMATypeAwareAnalyzer} + */ +public class UIMATypeAwareAnalyzerTest extends BaseTokenStreamTestCase { + + private UIMATypeAwareAnalyzer analyzer; + + @Before + public void setUp() throws Exception { + super.setUp(); + analyzer = new UIMATypeAwareAnalyzer("/uima/AggregateSentenceAE.xml", + "org.apache.uima.TokenAnnotation", "posTag"); + } + + @After + public void tearDown() throws Exception { + analyzer.close(); + super.tearDown(); + } + + @Test + public void baseUIMATypeAwareAnalyzerStreamTest() throws Exception { + + // create a token stream + TokenStream ts = analyzer.tokenStream("text", new StringReader("the big brown fox jumped on the wood")); + + // check that 'the big brown fox jumped on the wood' tokens have the expected PoS types + assertTokenStreamContents(ts, + new String[]{"the", "big", "brown", "fox", "jumped", "on", "the", "wood"}, + new String[]{"at", "jj", "jj", "nn", "vbd", "in", "at", "nn"}); + + } + + @Test + public void testRandomStrings() throws Exception { + checkRandomData(random, new UIMATypeAwareAnalyzer("/uima/AggregateDummySentenceAE.xml", + "org.apache.uima.TokenAnnotation", "tokenType"), 1000 * RANDOM_MULTIPLIER); + } + +} diff --git a/modules/analysis/uima/src/test/org/apache/lucene/analysis/uima/ae/BasicAEProviderTest.java b/modules/analysis/uima/src/test/org/apache/lucene/analysis/uima/ae/BasicAEProviderTest.java new file mode 100644 index 00000000000..08735d9d713 --- /dev/null +++ b/modules/analysis/uima/src/test/org/apache/lucene/analysis/uima/ae/BasicAEProviderTest.java @@ -0,0 +1,36 @@ +package org.apache.lucene.analysis.uima.ae; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.uima.analysis_engine.AnalysisEngine; +import org.junit.Test; + +import static org.junit.Assert.assertNotNull; + +/** + * TestCase for {@link BasicAEProvider} + */ +public class BasicAEProviderTest { + + @Test + public void testBasicInititalization() throws Exception { + AEProvider basicAEProvider = new BasicAEProvider("/uima/DummyEntityAE.xml"); + AnalysisEngine analysisEngine = basicAEProvider.getAE(); + assertNotNull(analysisEngine); + } +} diff --git a/modules/analysis/uima/src/test/org/apache/lucene/analysis/uima/ae/OverridingParamsAEProviderTest.java b/modules/analysis/uima/src/test/org/apache/lucene/analysis/uima/ae/OverridingParamsAEProviderTest.java new file mode 100644 index 00000000000..f8325fe5968 --- /dev/null +++ b/modules/analysis/uima/src/test/org/apache/lucene/analysis/uima/ae/OverridingParamsAEProviderTest.java @@ -0,0 +1,61 @@ +package org.apache.lucene.analysis.uima.ae; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.uima.analysis_engine.AnalysisEngine; +import org.apache.uima.resource.ResourceInitializationException; +import org.junit.Test; + +import java.util.HashMap; +import java.util.Map; + +import static org.junit.Assert.*; + +/** + * TestCase for {@link OverridingParamsAEProvider} + */ +public class OverridingParamsAEProviderTest { + + @Test + public void testNullMapInitialization() throws Exception { + try { + AEProvider aeProvider = new OverridingParamsAEProvider("/uima/DummyEntityAE.xml", null); + aeProvider.getAE(); + fail("should fail due to null Map passed"); + } catch (ResourceInitializationException e) { + // everything ok + } + } + + @Test + public void testEmptyMapInitialization() throws Exception { + AEProvider aeProvider = new OverridingParamsAEProvider("/uima/DummyEntityAE.xml", new HashMap()); + AnalysisEngine analysisEngine = aeProvider.getAE(); + assertNotNull(analysisEngine); + } + + @Test + public void testOverridingParamsInitialization() throws Exception { + Map runtimeParameters = new HashMap(); + runtimeParameters.put("ngramsize", "3"); + AEProvider aeProvider = new OverridingParamsAEProvider("/uima/AggregateSentenceAE.xml", runtimeParameters); + AnalysisEngine analysisEngine = aeProvider.getAE(); + assertNotNull(analysisEngine); + assertEquals(analysisEngine.getConfigParameterValue("ngramsize"), 3); + } +} diff --git a/modules/analysis/uima/src/test/org/apache/lucene/analysis/uima/an/DummyEntityAnnotator.java b/modules/analysis/uima/src/test/org/apache/lucene/analysis/uima/an/DummyEntityAnnotator.java new file mode 100644 index 00000000000..bd6cc9c9e67 --- /dev/null +++ b/modules/analysis/uima/src/test/org/apache/lucene/analysis/uima/an/DummyEntityAnnotator.java @@ -0,0 +1,61 @@ +package org.apache.lucene.analysis.uima.an; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.uima.TokenAnnotation; +import org.apache.uima.analysis_component.JCasAnnotator_ImplBase; +import org.apache.uima.analysis_engine.AnalysisEngineProcessException; +import org.apache.uima.cas.Feature; +import org.apache.uima.cas.Type; +import org.apache.uima.cas.text.AnnotationFS; +import org.apache.uima.jcas.JCas; +import org.apache.uima.jcas.tcas.Annotation; + +public class DummyEntityAnnotator extends JCasAnnotator_ImplBase { + + private static final String NP = "np"; + private static final String NPS = "nps"; + private static final String TYPE_NAME = "org.apache.lucene.analysis.uima.ts.EntityAnnotation"; + private static final String ENTITY_FEATURE = "entity"; + private static final String NAME_FEATURE = "entity"; + + @Override + public void process(JCas jcas) throws AnalysisEngineProcessException { + Type type = jcas.getCas().getTypeSystem().getType(TYPE_NAME); + Feature entityFeature = type.getFeatureByBaseName(ENTITY_FEATURE); + Feature nameFeature = type.getFeatureByBaseName(NAME_FEATURE); + + for (Annotation annotation : jcas.getAnnotationIndex(TokenAnnotation.type)) { + String tokenPOS = ((TokenAnnotation) annotation).getPosTag(); + + if (NP.equals(tokenPOS) || NPS.equals(tokenPOS)) { + AnnotationFS entityAnnotation = jcas.getCas().createAnnotation(type, annotation.getBegin(), annotation.getEnd()); + + entityAnnotation.setStringValue(entityFeature, annotation.getCoveredText()); + + String name = "OTHER"; // "OTHER" makes no sense. In practice, "PERSON", "COUNTRY", "E-MAIL", etc. + if (annotation.getCoveredText().equals("Apache")) + name = "ORGANIZATION"; + entityAnnotation.setStringValue(nameFeature, name); + + jcas.addFsToIndexes(entityAnnotation); + } + } + } + +} diff --git a/modules/analysis/uima/src/test/org/apache/lucene/analysis/uima/an/DummyPoSTagger.java b/modules/analysis/uima/src/test/org/apache/lucene/analysis/uima/an/DummyPoSTagger.java new file mode 100644 index 00000000000..a120bce3997 --- /dev/null +++ b/modules/analysis/uima/src/test/org/apache/lucene/analysis/uima/an/DummyPoSTagger.java @@ -0,0 +1,57 @@ +package org.apache.lucene.analysis.uima.an; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.uima.TokenAnnotation; +import org.apache.uima.analysis_component.JCasAnnotator_ImplBase; +import org.apache.uima.analysis_engine.AnalysisEngineProcessException; +import org.apache.uima.cas.Feature; +import org.apache.uima.cas.Type; +import org.apache.uima.jcas.JCas; +import org.apache.uima.jcas.tcas.Annotation; + +/** + */ +public class DummyPoSTagger extends JCasAnnotator_ImplBase { + + private static final String NUM = "NUM"; + private static final String WORD = "WORD"; + private static final String TYPE_NAME = "org.apache.uima.TokenAnnotation"; + private static final String FEATURE_NAME = "tokenType"; + + @Override + public void process(JCas jcas) throws AnalysisEngineProcessException { + Type type = jcas.getCas().getTypeSystem().getType(TYPE_NAME); + Feature posFeature = type.getFeatureByBaseName(FEATURE_NAME); + + for (Annotation annotation : jcas.getAnnotationIndex(TokenAnnotation.type)) { + String text = annotation.getCoveredText(); + String pos = extractPoS(text); + annotation.setStringValue(posFeature, pos); + } + } + + private String extractPoS(String text) { + try { + Double.valueOf(text); + return NUM; + } catch (Exception e) { + return WORD; + } + } +}