Commit 923cea7 ("first commit", committed Feb 11, 2020)
0 parents, 26 files changed, +2813 / -0 lines

‎LICENSE

+202 lines

                                 Apache License
                           Version 2.0, January 2004
                        http://www.apache.org/licenses/

   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION

   1. Definitions.

      "License" shall mean the terms and conditions for use, reproduction,
      and distribution as defined by Sections 1 through 9 of this document.

      "Licensor" shall mean the copyright owner or entity authorized by
      the copyright owner that is granting the License.

      "Legal Entity" shall mean the union of the acting entity and all
      other entities that control, are controlled by, or are under common
      control with that entity. For the purposes of this definition,
      "control" means (i) the power, direct or indirect, to cause the
      direction or management of such entity, whether by contract or
      otherwise, or (ii) ownership of fifty percent (50%) or more of the
      outstanding shares, or (iii) beneficial ownership of such entity.

      "You" (or "Your") shall mean an individual or Legal Entity
      exercising permissions granted by this License.

      "Source" form shall mean the preferred form for making modifications,
      including but not limited to software source code, documentation
      source, and configuration files.

      "Object" form shall mean any form resulting from mechanical
      transformation or translation of a Source form, including but
      not limited to compiled object code, generated documentation,
      and conversions to other media types.

      "Work" shall mean the work of authorship, whether in Source or
      Object form, made available under the License, as indicated by a
      copyright notice that is included in or attached to the work
      (an example is provided in the Appendix below).

      "Derivative Works" shall mean any work, whether in Source or Object
      form, that is based on (or derived from) the Work and for which the
      editorial revisions, annotations, elaborations, or other modifications
      represent, as a whole, an original work of authorship. For the purposes
      of this License, Derivative Works shall not include works that remain
      separable from, or merely link (or bind by name) to the interfaces of,
      the Work and Derivative Works thereof.

      "Contribution" shall mean any work of authorship, including
      the original version of the Work and any modifications or additions
      to that Work or Derivative Works thereof, that is intentionally
      submitted to Licensor for inclusion in the Work by the copyright owner
      or by an individual or Legal Entity authorized to submit on behalf of
      the copyright owner. For the purposes of this definition, "submitted"
      means any form of electronic, verbal, or written communication sent
      to the Licensor or its representatives, including but not limited to
      communication on electronic mailing lists, source code control systems,
      and issue tracking systems that are managed by, or on behalf of, the
      Licensor for the purpose of discussing and improving the Work, but
      excluding communication that is conspicuously marked or otherwise
      designated in writing by the copyright owner as "Not a Contribution."

      "Contributor" shall mean Licensor and any individual or Legal Entity
      on behalf of whom a Contribution has been received by Licensor and
      subsequently incorporated within the Work.

   2. Grant of Copyright License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      copyright license to reproduce, prepare Derivative Works of,
      publicly display, publicly perform, sublicense, and distribute the
      Work and such Derivative Works in Source or Object form.

   3. Grant of Patent License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      (except as stated in this section) patent license to make, have made,
      use, offer to sell, sell, import, and otherwise transfer the Work,
      where such license applies only to those patent claims licensable
      by such Contributor that are necessarily infringed by their
      Contribution(s) alone or by combination of their Contribution(s)
      with the Work to which such Contribution(s) was submitted. If You
      institute patent litigation against any entity (including a
      cross-claim or counterclaim in a lawsuit) alleging that the Work
      or a Contribution incorporated within the Work constitutes direct
      or contributory patent infringement, then any patent licenses
      granted to You under this License for that Work shall terminate
      as of the date such litigation is filed.

   4. Redistribution. You may reproduce and distribute copies of the
      Work or Derivative Works thereof in any medium, with or without
      modifications, and in Source or Object form, provided that You
      meet the following conditions:

      (a) You must give any other recipients of the Work or
          Derivative Works a copy of this License; and

      (b) You must cause any modified files to carry prominent notices
          stating that You changed the files; and

      (c) You must retain, in the Source form of any Derivative Works
          that You distribute, all copyright, patent, trademark, and
          attribution notices from the Source form of the Work,
          excluding those notices that do not pertain to any part of
          the Derivative Works; and

      (d) If the Work includes a "NOTICE" text file as part of its
          distribution, then any Derivative Works that You distribute must
          include a readable copy of the attribution notices contained
          within such NOTICE file, excluding those notices that do not
          pertain to any part of the Derivative Works, in at least one
          of the following places: within a NOTICE text file distributed
          as part of the Derivative Works; within the Source form or
          documentation, if provided along with the Derivative Works; or,
          within a display generated by the Derivative Works, if and
          wherever such third-party notices normally appear. The contents
          of the NOTICE file are for informational purposes only and
          do not modify the License. You may add Your own attribution
          notices within Derivative Works that You distribute, alongside
          or as an addendum to the NOTICE text from the Work, provided
          that such additional attribution notices cannot be construed
          as modifying the License.

      You may add Your own copyright statement to Your modifications and
      may provide additional or different license terms and conditions
      for use, reproduction, or distribution of Your modifications, or
      for any such Derivative Works as a whole, provided Your use,
      reproduction, and distribution of the Work otherwise complies with
      the conditions stated in this License.

   5. Submission of Contributions. Unless You explicitly state otherwise,
      any Contribution intentionally submitted for inclusion in the Work
      by You to the Licensor shall be under the terms and conditions of
      this License, without any additional terms or conditions.
      Notwithstanding the above, nothing herein shall supersede or modify
      the terms of any separate license agreement you may have executed
      with Licensor regarding such Contributions.

   6. Trademarks. This License does not grant permission to use the trade
      names, trademarks, service marks, or product names of the Licensor,
      except as required for reasonable and customary use in describing the
      origin of the Work and reproducing the content of the NOTICE file.

   7. Disclaimer of Warranty. Unless required by applicable law or
      agreed to in writing, Licensor provides the Work (and each
      Contributor provides its Contributions) on an "AS IS" BASIS,
      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
      implied, including, without limitation, any warranties or conditions
      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
      PARTICULAR PURPOSE. You are solely responsible for determining the
      appropriateness of using or redistributing the Work and assume any
      risks associated with Your exercise of permissions under this License.

   8. Limitation of Liability. In no event and under no legal theory,
      whether in tort (including negligence), contract, or otherwise,
      unless required by applicable law (such as deliberate and grossly
      negligent acts) or agreed to in writing, shall any Contributor be
      liable to You for damages, including any direct, indirect, special,
      incidental, or consequential damages of any character arising as a
      result of this License or out of the use or inability to use the
      Work (including but not limited to damages for loss of goodwill,
      work stoppage, computer failure or malfunction, or any and all
      other commercial damages or losses), even if such Contributor
      has been advised of the possibility of such damages.

   9. Accepting Warranty or Additional Liability. While redistributing
      the Work or Derivative Works thereof, You may choose to offer,
      and charge a fee for, acceptance of support, warranty, indemnity,
      or other liability obligations and/or rights consistent with this
      License. However, in accepting such obligations, You may act only
      on Your own behalf and on Your sole responsibility, not on behalf
      of any other Contributor, and only if You agree to indemnify,
      defend, and hold each Contributor harmless for any liability
      incurred by, or claims asserted against, such Contributor by reason
      of your accepting any such warranty or additional liability.

   END OF TERMS AND CONDITIONS

   APPENDIX: How to apply the Apache License to your work.

      To apply the Apache License to your work, attach the following
      boilerplate notice, with the fields enclosed by brackets "[]"
      replaced with your own identifying information. (Don't include
      the brackets!) The text should be enclosed in the appropriate
      comment syntax for the file format. We also recommend that a
      file or class name and description of purpose be included on the
      same "printed page" as the copyright notice for easier
      identification within third-party archives.

   Copyright 2014 The Board of Trustees of The Leland Stanford Junior University

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.

‎README.md

+90 lines

# Docker environment for natural language processing

## Overview
Uses GloVe word vectors and SCDV sentence vectors to perform similar-word and similar-sentence search.

## Environment

- Host environment

| Component | Version |
| --- | --- |
| Ubuntu | 18.04 |
| Docker | 18.09.7 |
| Docker Compose | 1.17.1 |

- Docker images

| Component | Image |
| --- | --- |
| Jupyter Lab | jupyter/datascience-notebook |
| Elasticsearch | docker.elastic.co/elasticsearch/elasticsearch:7.5.0 |
| Kibana | docker.elastic.co/kibana/kibana:7.5.0 |

## Docker image overview
- Jupyter Lab
  - Sudachi and Ginza installed
  - Japanese font installed for matplotlib

- Elasticsearch
  - analysis-sudachi-elasticsearch plugin installed

## Setup
- Install Docker.
```
$ sudo apt update
$ sudo apt install docker docker-compose
```
- Run the initial setup.
```
$ ./init.sh
```

- Start the containers with docker-compose.
```
$ sudo docker-compose up
```

## Accessing each environment
- Jupyter Lab container
  http://[host]:8888

- Kibana
  http://[host]:5601

- Elasticsearch
  http://[host]:9200

## Usage
The work is basically done in Jupyter Lab.

### 1. Initial setup, data crawling, and word/sentence vectorization with GloVe and SCDV
1. Access the Jupyter Lab container.
2. Open nlp_book.ipynb.
3. Run the cells in order from the top.

The main processing steps are (one possible scripted ordering is sketched below):
- Data crawling
- Data preprocessing
- Registering the data in Elasticsearch
- Tokenizing the sentences with Elasticsearch
- Building word vectors with GloVe
- Building sentence vectors with SCDV and registering them in Elasticsearch
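
nlp_book.ipynb itself is not rendered in this diff, so the exact cell contents are unknown. The sketch below is only one plausible ordering of the scripts included in this commit, run from /home/jovyan/work inside the Jupyter container; the index names (anzen, accident) and the html/, json/, csv/, excel/, tokenized/ and vector/ paths are assumptions inferred from the other files, and the indices are assumed to have been created beforehand from es_anzen_schema.txt and es_accident_schema.txt.

```
# Hypothetical end-to-end driver; the real steps live in nlp_book.ipynb.
import subprocess

def run(cmd):
    # Echo and execute one pipeline step, stopping on the first failure.
    print("$", cmd)
    subprocess.run(cmd, shell=True, check=True)

run("mkdir -p html json csv excel tokenized vector")
# 1. Crawl the source pages and Excel files.
run("bash get_doc.sh")
# 2. Preprocess: HTML -> bulk JSON, Excel -> CSV.
run("python html_to_json.py ./html/ ./json/")
run("python excel_to_csv.py ./excel ./csv")
# 3. Register the documents in Elasticsearch.
run("bash load_anzen_bulk_es.sh")
run("python load_accident_es.py --host elasticsearch --index accident --input_dir ./csv")
# 4. Tokenize the sentences with the sudachi_analyzer defined in the index settings.
run("python es_anzen_tokenize.py --host elasticsearch --index anzen --output tokenized/anzen")
run("python es_accident_tokenize.py --host elasticsearch --index accident --output tokenized/accident")
run("python merge_csv.py --input_anzen tokenized/anzen.csv --input_accident tokenized/accident.csv --output_csv tokenized/merge.csv")
# 5. Train GloVe word vectors on the tokenized corpus (tokenized/all_tokens.txt).
run("cat tokenized/anzen.txt tokenized/accident.txt > tokenized/all_tokens.txt")
run("bash glove.sh")
# 6. Build SCDV sentence vectors and write them back to Elasticsearch.
run("python scdv_to_es.py --host elasticsearch --input_csv tokenized/merge.csv")
```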
### 2. Extracting similar words
Use the GloVe word vectors to extract words similar to a given word (a minimal sketch of the lookup is shown below).

1. Open Similarity.ipynb.
2. In the first cell, set the word to search for and how many results to return:
   - word : the word whose similar words should be extracted
   - top_k : how many of the top-ranked results to return
3. Run the cells.
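
Similarity.ipynb is not rendered in this diff. The following only illustrates how the lookup could be done with gensim and the word2vec-format vector file written by scdv.py (vector/gensim_glove_vectors.txt); the query word is just an example.

```
from gensim.models import KeyedVectors

# GloVe vectors converted to word2vec format by scdv.py (load_glove_vector).
glove_vectors = KeyedVectors.load_word2vec_format(
    "vector/gensim_glove_vectors.txt", binary=False)

word = "転倒"   # word to look up (example)
top_k = 10      # number of neighbours to return

# Cosine-similarity neighbours of the query word.
for token, score in glove_vectors.most_similar(word, topn=top_k):
    print(f"{token}\t{score:.3f}")
```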
### 3. LDA topic model
Classify the documents with an LDA topic model built from the tokenized words (a small sketch follows below).

1. Open LDA_topic_model.ipynb.
2. Run the cells from top to bottom. The cells:
   - train the LDA topic model,
   - visualize the characteristic words of each learned topic with WordCloud,
   - visualize the topic distribution with pyLDAvis.

‎docker-compose.yml

+44 lines

version: "3"

services:
  elasticsearch:
    build:
      context: ./elasticsearch
      dockerfile: dockerfile_elastic
    environment:
      - discovery.type=single-node
      - cluster.name=docker-cluster
      - bootstrap.memory_lock=true
      - xpack.security.enabled=false
      - "ES_JAVA_OPTS=-Xms512m -Xmx512m"
    ulimits:
      memlock:
        soft: -1
        hard: -1
    ports:
      - 9200:9200
    volumes:
      - ./elasticsearch/es-data:/usr/share/elasticsearch/data
  kibana:
    image: docker.elastic.co/kibana/kibana:7.5.0
    ports:
      - 5601:5601
  jupyter:
    build:
      context: ./jupyter
      dockerfile: dockerfile_jupyter
    user: root
    environment:
      #NB_UID: 500
      #NB_GID: 100
      NB_UID: 1000
      NB_GID: 1000
      GRANT_SUDO: "yes"
      TZ: "Asia/Tokyo"
      JUPYTER_ENABLE_LAB: "yes"
    ports:
      - "8888:8888"
    volumes:
      - "./jupyter/data:/home/jovyan/work"
    privileged: true
    command: start.sh jupyter lab --NotebookApp.token='' --no-browser
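
Within the compose network the containers reach each other by service name, which is why the scripts under jupyter/data talk to elasticsearch:9200 rather than localhost. A minimal connectivity check from the Jupyter container could look like this (sketch only, using the pinned elasticsearch client):

```
from elasticsearch import Elasticsearch

# The service name from docker-compose.yml doubles as the host name.
es = Elasticsearch("elasticsearch:9200")
print(es.ping())                        # True once the cluster is up
print(es.cluster.health()["status"])    # e.g. "yellow" for a single-node cluster
```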

‎elasticsearch/dockerfile_elastic

+12 lines

ARG ELASTIC_VER=7.5.0
ARG ELASTIC_SUDACHI_VER=${ELASTIC_VER}-1.3.2
ARG SUDACHI_VER=0.3.2

FROM docker.elastic.co/elasticsearch/elasticsearch:${ELASTIC_VER}

COPY sudachi.json /usr/share/elasticsearch/config/sudachi/
COPY analysis-sudachi-elasticsearch7.5-1.3.2.zip /tmp/
COPY system_full.dic /usr/share/elasticsearch/config/sudachi/

RUN elasticsearch-plugin install file:///tmp/analysis-sudachi-elasticsearch7.5-1.3.2.zip && \
    rm /tmp/analysis-sudachi-elasticsearch7.5-1.3.2.zip
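
An optional check that the Sudachi analysis plugin really ended up inside the image, run from the Jupyter container with the pinned elasticsearch client (the host name elasticsearch comes from docker-compose.yml):

```
from elasticsearch import Elasticsearch

es = Elasticsearch("elasticsearch:9200")
# The cat API lists installed plugins; analysis-sudachi should appear in the output.
print(es.cat.plugins(format="json"))
```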

‎elasticsearch/sudachi.json

+25 lines

{
  "systemDict" : "system_full.dic",
  "inputTextPlugin" : [
    { "class" : "com.worksap.nlp.sudachi.DefaultInputTextPlugin" },
    { "class" : "com.worksap.nlp.sudachi.ProlongedSoundMarkInputTextPlugin",
      "prolongedSoundMarks": ["ー", "-", "⁓", "〜", "〰"],
      "replacementSymbol": "ー"}
  ],
  "oovProviderPlugin" : [
    { "class" : "com.worksap.nlp.sudachi.MeCabOovProviderPlugin" },
    { "class" : "com.worksap.nlp.sudachi.SimpleOovProviderPlugin",
      "oovPOS" : [ "補助記号", "一般", "*", "*", "*", "*" ],
      "leftId" : 5968,
      "rightId" : 5968,
      "cost" : 3857 }
  ],
  "pathRewritePlugin" : [
    { "class" : "com.worksap.nlp.sudachi.JoinNumericPlugin",
      "joinKanjiNumeric" : true },
    { "class" : "com.worksap.nlp.sudachi.JoinKatakanaOovPlugin",
      "oovPOS" : [ "名詞", "普通名詞", "一般", "*", "*", "*" ],
      "minLength" : 3
    }
  ]
}

‎init.sh

+11 lines

#!/bin/bash
cd ./elasticsearch
mkdir es-data

wget https://object-storage.tyo2.conoha.io/v1/nc_2520839e1f9641b08211a5c85243124a/sudachi/sudachi-dictionary-20200127-full.zip

unzip sudachi-dictionary-20200127-full.zip

mv sudachi-dictionary-20200127/system_full.dic .

wget https://github.com/WorksApplications/elasticsearch-sudachi/releases/download/v7.5.0-1.3.2/analysis-sudachi-elasticsearch7.5-1.3.2.zip

‎jupyter/data/Elasticsearch_sim_search.ipynb

+438 lines (large diff not rendered)

‎jupyter/data/LDA_topic_model.ipynb

+399 lines (large diff not rendered)

‎jupyter/data/Similarity.ipynb

+84 lines (large diff not rendered)

‎jupyter/data/es_accident_schema.txt

+77 lines

{
  "aliases" : {},
  "mappings":{
    "properties" : {
      "sentence" : {
        "type" : "text"
      },
      "category" : {
        "type" : "keyword"
      },
      "scdv_vector" : {
        "type" : "dense_vector",
        "dims" : 1000
      }
    }
  },
  "settings": {
    "index": {
      "analysis": {
        "tokenizer": {
          "sudachi_tokenizer": {
            "mode" : "search",
            "settings_path" : "/usr/share/elasticsearch/config/sudachi/sudachi.json",
            "resources_path" : "/usr/share/elasticsearch/config/sudachi/",
            "type" : "sudachi_tokenizer",
            "discard_punctuation" : "true"
          }
        },
        "analyzer": {
          "sudachi_analyzer": {
            "filter": [
              "sudachi_baseform",
              "lowercase",
              "my_posfilter",
              "my_stopfilter"
            ],
            "tokenizer": "sudachi_tokenizer",
            "type": "custom"
          }
        },
        "filter":{
          "my_posfilter":{
            "type":"sudachi_part_of_speech",
            "stoptags":[
              "接続詞","助動詞","助詞","記号","補助記号","名詞,数詞",
              "名詞,普通名詞,助数詞可能"
            ]
          },
          "my_stopfilter":{
            "type":"sudachi_ja_stop",
            "stopwords":[
              "は", "です", "する", "いる", "ため", "CM", "cm", "CM",
              "次", "名", "行う", "等", "者", "際", "こと", "ある",
              "この", "その", "そこ", "これ"
            ]
          }
        }
      }
    }
  }
}
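
The scdv_vector field above is a 1000-dimensional dense_vector, which is what enables the similar-sentence search mentioned in the README. Elasticsearch_sim_search.ipynb is not rendered in this diff, so the following is only a sketch of how such a query could look on Elasticsearch 7.5, using cosineSimilarity inside a script_score query; the index name accident is an assumption, and the query vector would come from scdv.py for the input sentence.

```
from elasticsearch import Elasticsearch

es = Elasticsearch("elasticsearch:9200")

def similar_sentences(query_vector, size=5):
    # Rank documents by cosine similarity between the stored SCDV vector and
    # the query vector (+1.0 keeps the score non-negative, as script_score requires).
    body = {
        "size": size,
        "query": {
            "script_score": {
                "query": {"match_all": {}},
                "script": {
                    "source": "cosineSimilarity(params.query_vector, doc['scdv_vector']) + 1.0",
                    "params": {"query_vector": query_vector},
                },
            }
        },
    }
    return es.search(index="accident", body=body)["hits"]["hits"]
```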

‎jupyter/data/es_accident_tokenize.py

+95 lines

import elasticsearch
import json, argparse

class elasticsearchClient():
    def __init__(self, host, port, index):
        self.host = host
        self.port = port
        self.index = index
        self.client = elasticsearch.Elasticsearch(self.host + ":" + self.port)

    # Tokenize a sentence with the index's sudachi_analyzer.
    def tokenize(self, sentence):
        body_ = {"analyzer": "sudachi_analyzer", "text": sentence}
        json_tokens = self.client.indices.analyze(
            index = self.index, body=body_)

        tokens = [token['token'] for token in json_tokens['tokens']]
        return tokens

    def parse_data(self, items):
        results = []

        for item in items:
            index = json.dumps(item['_id'])
            category = json.dumps(
                item['_source']['category'],
                indent=2, ensure_ascii=False)
            sentence = json.dumps(
                item['_source']['sentence'],
                indent=2, ensure_ascii=False)

            tokens = self.tokenize(sentence)
            results.append((index, category, sentence, tokens))
        return results

    # Fetch all documents with the scroll API.
    def get_all_data(self, scroll_time, scroll_size):
        results = []

        data = self.client.search(
            index = self.index,
            scroll = scroll_time,
            size = scroll_size,
            body = {})
        sid = data['_scroll_id']
        scroll_size = len(data['hits']['hits'])

        results = self.parse_data(data['hits']['hits'])

        while scroll_size > 0:
            data = self.client.scroll(
                scroll_id = sid,
                scroll = scroll_time)

            sid = data['_scroll_id']
            scroll_size = len(data['hits']['hits'])
            scroll_results = self.parse_data(data['hits']['hits'])
            results.extend(scroll_results)

        return results

    def update(self, row_id, body):
        response = self.client.update(
            index = self.index,
            id = row_id,
            body = body)
        print(response)

def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument('--host', type=str)
    parser.add_argument('--port', type=str, default='9200')
    parser.add_argument('--index', type=str)
    parser.add_argument('--output', type=str)
    parser.add_argument('--scroll_limit', type=str, default='1m')
    parser.add_argument('--scroll_size', type=int, default=100)

    return parser.parse_args()

def main(args):
    client = elasticsearchClient(args.host, args.port, args.index)
    results = client.get_all_data(args.scroll_limit, args.scroll_size)

    output_txt = args.output + '.txt'
    output_csv = args.output + '.csv'
    with open(output_csv, "w") as f_csv:
        with open(output_txt, "w") as f_txt:
            f_csv.writelines('ID,category,sentence,tokens\n')

            for result in results:
                tokens = " ".join(result[3])
                f_csv.writelines(result[0] + ',' + result[1] + ',' + result[2] + ',"' + tokens + '"\n')
                f_txt.writelines(tokens + '\n')

if __name__ == '__main__':
    main(parse_args())
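
A small usage sketch for the client above, run from the Jupyter container. The index name accident and the example sentence are assumptions, and the index must already exist with the sudachi_analyzer from es_accident_schema.txt.

```
from es_accident_tokenize import elasticsearchClient

# Tokenize a single sentence with the sudachi_analyzer of the "accident" index.
client = elasticsearchClient("elasticsearch", "9200", "accident")
print(client.tokenize("作業中に脚立から転落し、負傷した。"))

# Or dump every document and its tokens, as main() does:
#   python es_accident_tokenize.py --host elasticsearch --index accident --output tokenized/accident
```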

‎jupyter/data/es_anzen_schema.txt

+111 lines

{
  "aliases" : {},
  "mappings":{
    "properties" : {
      "title" : {
        "type" : "nested",
        "properties" : {
          "title_id" : {"type" : "keyword"},
          "text" : { "type" : "text" },
          "vector" : {
            "type" : "dense_vector",
            "dims" : 1000
          }
        }
      },
      "situation" : {
        "type" : "nested",
        "properties" : {
          "situation_id" : {"type" : "keyword"},
          "text" : { "type" : "text" },
          "vector" : {
            "type" : "dense_vector",
            "dims" : 1000
          }
        }
      },
      "cause" : {
        "type" : "nested",
        "properties" : {
          "cause_id" : {"type" : "keyword"},
          "text" : { "type" : "text" },
          "vector" : {
            "type" : "dense_vector",
            "dims" : 1000
          }
        }
      },
      "measures" : {
        "type" : "nested",
        "properties" : {
          "measures_id" : {"type" : "keyword"},
          "text" : { "type" : "text" },
          "vector" : {
            "type" : "dense_vector",
            "dims" : 1000
          }
        }
      }
    }
  },
  "settings": {
    "index": {
      "analysis": {
        "tokenizer": {
          "sudachi_tokenizer": {
            "mode" : "search",
            "settings_path" : "/usr/share/elasticsearch/config/sudachi/sudachi.json",
            "resources_path" : "/usr/share/elasticsearch/config/sudachi/",
            "type" : "sudachi_tokenizer",
            "discard_punctuation" : "true"
          }
        },
        "analyzer": {
          "sudachi_analyzer": {
            "filter": [
              "sudachi_baseform",
              "lowercase",
              "my_posfilter",
              "my_stopfilter"
            ],
            "tokenizer": "sudachi_tokenizer",
            "type": "custom"
          }
        },
        "filter":{
          "my_posfilter":{
            "type":"sudachi_part_of_speech",
            "stoptags":[
              "接続詞","助動詞","助詞","記号","補助記号","名詞,数詞",
              "名詞,普通名詞,助数詞可能"
            ]
          },
          "my_stopfilter":{
            "type":"sudachi_ja_stop",
            "stopwords":[
              "は", "です", "する", "いる", "ため", "CM", "cm", "CM",
              "次", "名", "行う", "等", "者", "際", "こと", "ある",
              "この", "その", "そこ", "これ"
            ]
          }
        }
      }
    }
  }
}

‎jupyter/data/es_anzen_tokenize.py

+137 lines

# -*- coding: utf-8 -*-
import elasticsearch
import json, argparse

class elasticsearchClient():
    def __init__(self, host, port, index):
        self.host = host
        self.port = port
        self.index = index
        self.client = elasticsearch.Elasticsearch(self.host + ":" + self.port)

    # Tokenize a sentence with the index's sudachi_analyzer.
    def tokenize(self, sentence):
        body_ = {"analyzer": "sudachi_analyzer", "text": sentence}
        json_tokens = self.client.indices.analyze(
            index = self.index, body=body_)

        tokens = [token['token'] for token in json_tokens['tokens']]
        return tokens

    def parse_data(self, items):
        results = []

        for item in items:
            index = json.dumps(item['_id'])
            title = json.dumps(
                item['_source']['title']['text'],
                indent=2, ensure_ascii=False)
            title_id = json.dumps(
                item['_source']['title']['title_id'],
                indent=2, ensure_ascii=False)

            _cause = item['_source']['cause']
            cause = []
            cause_id = []
            for val in _cause:
                cause.append(json.dumps(val['text'], ensure_ascii=False))
                cause_id.append(json.dumps(val['cause_id'], ensure_ascii=False))

            situation = []
            situation_id = []
            _situation = item['_source']['situation']
            for val in _situation:
                situation.append(json.dumps(val['text'], ensure_ascii=False))
                situation_id.append(json.dumps(val['situation_id'], ensure_ascii=False))

            measures = []
            measures_id = []
            _measures = item['_source']['measures']
            for val in _measures:
                measures.append(json.dumps(val['text'], ensure_ascii=False))
                measures_id.append(json.dumps(val['measures_id'], ensure_ascii=False))

            title_tokens = self.tokenize(title)
            if len(title_tokens) > 0:
                results.append((index, "title", title_id, title, title_tokens))

            for (id, val) in zip(cause_id, cause):
                val_tokens = self.tokenize(val)
                if len(val_tokens) > 0:
                    results.append((index, "cause", id, val, val_tokens))

            for (id, val) in zip(situation_id, situation):
                val_tokens = self.tokenize(val)
                if len(val_tokens) > 0:
                    results.append((index, "situation", id, val, val_tokens))

            for (id, val) in zip(measures_id, measures):
                val_tokens = self.tokenize(val)
                if len(val_tokens) > 0:
                    results.append((index, "measures", id, val, val_tokens))

        return results

    # Fetch all documents with the scroll API.
    def get_all_data(self, scroll_time, scroll_size):
        results = []

        data = self.client.search(
            index = self.index,
            scroll = scroll_time,
            size = scroll_size,
            body = {})
        sid = data['_scroll_id']
        scroll_size = len(data['hits']['hits'])

        results = self.parse_data(data['hits']['hits'])

        while scroll_size > 0:
            data = self.client.scroll(
                scroll_id = sid,
                scroll = scroll_time)

            sid = data['_scroll_id']
            scroll_size = len(data['hits']['hits'])
            scroll_results = self.parse_data(data['hits']['hits'])

            results.extend(scroll_results)

        return results

    def update(self, row_id, body):
        response = self.client.update(
            index = self.index,
            id = row_id,
            body = body)
        print(response)

def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument('--host', type=str, default='localhost')
    parser.add_argument('--port', type=str, default='9200')
    parser.add_argument('--index', type=str)
    parser.add_argument('--output', type=str)
    parser.add_argument('--scroll_limit', type=str, default='1m')
    parser.add_argument('--scroll_size', type=int, default=100)

    return parser.parse_args()

def main(args):
    client = elasticsearchClient(args.host, args.port, args.index)
    results = client.get_all_data(args.scroll_limit, args.scroll_size)

    output_csv = args.output + '.csv'
    output_txt = args.output + '.txt'
    with open(output_csv, "w") as f_csv:
        with open(output_txt, "w") as f_txt:
            f_csv.writelines('ID,種別,文章ID,文章,分かち書き\n')

            for result in results:
                tokens = " ".join(result[4])
                f_csv.writelines(result[0] + ',' + '"' + result[1] + '",' + result[2] + ',' + result[3].strip() + ',"' + tokens + '"\n')
                f_txt.writelines(tokens + '\n')

if __name__ == '__main__':
    main(parse_args())

‎jupyter/data/excel_to_csv.py

+42 lines

import pandas as pd
import glob, sys

args = sys.argv
excel_dir = args[1]
output_csv_dir = args[2]

files = glob.glob(excel_dir + '/*.xls*')

for file in files:
    excel = pd.ExcelFile(file)

    print(file)

    sheet_names = excel.sheet_names
    for i, name in enumerate(sheet_names):
        if i > 0:
            continue

        csv_file = file.replace(excel_dir, output_csv_dir).replace(".xlsx", "").replace(".xls", "") + '_' + str(i) + '.csv'
        sheet_df = excel.parse(name, header=[0, 1])
        columns_val = sheet_df.columns.values

        col_names = []
        for col_vals in columns_val:
            # Header rows that were merged cells are joined into one column name with '_'.
            # The category names mix full-width and half-width parentheses, so normalize them to half-width.
            col_name = col_vals[0].replace('\n', '') + '_' + col_vals[1].replace('\n','')
            col_name = col_name.replace('（','(').replace('）',')')

            if 'Unnamed' in col_vals[1]:
                col_name = col_vals[0].replace('\n','')
            col_names.append(col_name)
        sheet_df.columns = col_names

        situation_col_name = '災害状況'
        if 'kikaisaigai' in file:
            situation_col_name = '災害発生状況'

        sheet_df[situation_col_name] = sheet_df[situation_col_name].replace('\r\n','', regex=True).replace('\r','', regex=True).replace('\n','', regex=True)

        sheet_df.to_csv(csv_file)

‎jupyter/data/get_doc.sh

+38 lines

#!/bin/bash

for ((i=1 ; i<1131 ; i++))
do
    num=${i}
    file=./html/anzen_${num}.html

    echo ${num}
    wget https://anzeninfo.mhlw.go.jp/anzen_pg/SAI_DET.aspx?joho_no=${num} -O ${file}
done

for ((i=100003 ; i<101583 ; i++))
do
    num=${i}
    file=./html/anzen_${num}.html

    echo ${num}
    wget https://anzeninfo.mhlw.go.jp/anzen_pg/SAI_DET.aspx?joho_no=${num} -O ${file}
done

for ((i=3 ; i<30 ; i++))
do
    num=${i}
    if [ ${i} -lt 10 ]; then
        num=0${i}
    fi

    file=sibou_db_h${num}.xlsx

    if [ ${i} -lt 27 ]; then
        file=sibou_db_h${num}.xls
    fi

    echo ${file}
    wget https://anzeninfo.mhlw.go.jp/anzen/sib_xls/${file} -P ./excel/
done

wget https://anzeninfo.mhlw.go.jp/anzen/sai/kikaisaigai_db28.xlsx -P ./excel/

‎jupyter/data/glove.sh

+42 lines

#!/bin/bash
set -e

# Makes programs, downloads sample data, trains a GloVe model, and then evaluates it.
# One optional argument can specify the language used for eval script: matlab, octave or [default] python

CORPUS=tokenized/all_tokens.txt
VOCAB_FILE=./vector/glove_vocab.txt
COOCCURRENCE_FILE=./vector/glove_cooccurrence.bin
COOCCURRENCE_SHUF_FILE=./vector/glove_cooccurrence_shuf.bin
BUILDDIR=GloVe/build
SAVE_FILE=./vector/glove_vectors
VERBOSE=2
MEMORY=4.0
VOCAB_MIN_COUNT=0
VECTOR_SIZE=50
MAX_ITER=50
WINDOW_SIZE=15
BINARY=2
NUM_THREADS=8
X_MAX=10

echo
echo "$ $BUILDDIR/vocab_count -min-count $VOCAB_MIN_COUNT -verbose $VERBOSE < $CORPUS > $VOCAB_FILE"
$BUILDDIR/vocab_count -min-count $VOCAB_MIN_COUNT -verbose $VERBOSE < $CORPUS > $VOCAB_FILE
echo "$ $BUILDDIR/cooccur -memory $MEMORY -vocab-file $VOCAB_FILE -verbose $VERBOSE -window-size $WINDOW_SIZE < $CORPUS > $COOCCURRENCE_FILE"
$BUILDDIR/cooccur -memory $MEMORY -vocab-file $VOCAB_FILE -verbose $VERBOSE -window-size $WINDOW_SIZE < $CORPUS > $COOCCURRENCE_FILE
echo "$ $BUILDDIR/shuffle -memory $MEMORY -verbose $VERBOSE < $COOCCURRENCE_FILE > $COOCCURRENCE_SHUF_FILE"
$BUILDDIR/shuffle -memory $MEMORY -verbose $VERBOSE < $COOCCURRENCE_FILE > $COOCCURRENCE_SHUF_FILE
echo "$ $BUILDDIR/glove -save-file $SAVE_FILE -threads $NUM_THREADS -input-file $COOCCURRENCE_SHUF_FILE -x-max $X_MAX -iter $MAX_ITER -vector-size $VECTOR_SIZE -binary $BINARY -vocab-file $VOCAB_FILE -verbose $VERBOSE"
$BUILDDIR/glove -save-file $SAVE_FILE -threads $NUM_THREADS -input-file $COOCCURRENCE_SHUF_FILE -x-max $X_MAX -iter $MAX_ITER -vector-size $VECTOR_SIZE -binary $BINARY -vocab-file $VOCAB_FILE -verbose $VERBOSE
if [ "$CORPUS" = 'text8' ]; then
   if [ "$1" = 'matlab' ]; then
       matlab -nodisplay -nodesktop -nojvm -nosplash < ./eval/matlab/read_and_evaluate.m 1>&2
   elif [ "$1" = 'octave' ]; then
       octave < ./eval/octave/read_and_evaluate_octave.m 1>&2
   else
       echo "$ python eval/python/evaluate.py"
       python eval/python/evaluate.py
   fi
fi

‎jupyter/data/html_to_json.py

+58 lines

import json, glob, os.path, sys
from bs4 import BeautifulSoup

args = sys.argv
html_dir = args[1]
json_dir = args[2]

html_files = glob.glob(html_dir + "*")

for html_file in html_files:
    file_size = os.path.getsize(html_file)
    if file_size < 10000:
        print("[ERROR] {}".format(html_file))
        continue

    print(html_file)
    json_file = html_file.replace(html_dir, json_dir).replace(".html", ".json")

    id = html_file.replace(html_dir + "anzen_","").replace(".html","")

    html = BeautifulSoup(open(html_file, encoding="cp932"), 'html.parser')

    for i in html.select("br"):
        i.replace_with("\n")

    title = html.find('table').find('h1').text.strip()
    title_id = id + '_t_0'

    _cause = html.find("img", alt="原因").find_parent().find_parent().find('td').text.strip().replace("\u3000", "").split("\n")
    cause = []

    for i, val in enumerate(_cause):
        val = val.strip().replace("\t","").replace("\n","")
        if len(val) > 0:
            cause.append('{"cause_id":"%s", "text":"%s"}' % (id + '_c_' + str(i), val))
    cause = ",".join(cause)

    situation = []
    _situation = html.find("img", alt="発生状況").find_parent().find_parent().find('td').text.strip().replace("\u3000", "").split("\n")
    for i, val in enumerate(_situation):
        val = val.strip().replace("\t","").replace("\n","")
        if len(val) > 0:
            situation.append('{"situation_id":"%s", "text":"%s"}' % (id + '_s_' + str(i), val))
    situation = ",".join(situation)

    _measures = html.find("img", alt="対策").find_parent().find_parent().find('td').text.strip().replace("\u3000", "").split("\n")
    measures = []
    for i, val in enumerate(_measures):
        val = val.strip().replace("\t","").replace("\n","")
        if len(val) > 0:
            measures.append('{"measures_id":"%s", "text":"%s"}' % (id + '_m_' + str(i), val))
    measures = ",".join(measures)

    json_data = '{"index":{"_index":"anzen","_id":"%s"}},\n{"title":{"title_id":"%s", "text":"%s"},"situation":[%s],"cause":[%s],"measures":[%s]}' % (id, title_id, title, situation, cause, measures)

    with open(json_file, "w") as jw:
        jw.writelines(json_data + "\n\n")

‎jupyter/data/load_accident_es.py

+36 lines

import pandas as pd
import argparse, glob
from elasticsearch import Elasticsearch

def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument('--host', type=str)
    parser.add_argument('--port', type=str, default='9200')
    parser.add_argument('--index', type=str)
    parser.add_argument('--input_dir', type=str)

    return parser.parse_args()

def main(args):
    csv_files = glob.glob(args.input_dir + '/*.csv')

    for csv_file in csv_files:
        situation_col_name = '災害状況'
        if 'kikaisaigai' in csv_file:
            situation_col_name = '災害発生状況'

        df = pd.read_csv(csv_file, encoding='utf-8', header=0)
        sentences = df[situation_col_name]
        categories = df['業種(大分類)_分類名']

        es = Elasticsearch(host=args.host, port=args.port)

        for col, sentence in enumerate(sentences):
            json_data = '{"category":"%s","sentence":"%s"}' % (categories[col], sentence)

            print(json_data)
            es.index(index=args.index, doc_type="_doc", body=json_data)

if __name__ == '__main__':
    main(parse_args())

‎jupyter/data/load_anzen_bulk_es.sh

+6 lines

#!/bin/bash

for FILE in `find ./json/ -maxdepth 1 -type f`; do
    echo ${FILE}
    curl -X POST -H "Content-Type: application/json" "elasticsearch:9200/anzen/_bulk?pretty" --data-binary @${FILE}
done

‎jupyter/data/merge_csv.py

+31 lines

import pandas as pd
import argparse


def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument('--input_anzen', type=str)
    parser.add_argument('--input_accident', type=str)
    parser.add_argument('--output_csv', type=str)

    return parser.parse_args()

def main(args):
    anzen_csv = args.input_anzen
    accident_csv = args.input_accident
    merge_csv = args.output_csv

    anzen_df = pd.read_csv(anzen_csv)
    accident_df = pd.read_csv(accident_csv)

    # For the accident data, keep only the ID and token columns and add index/sentence_id columns.
    new_accident_df = accident_df.drop('sentence', axis=1).assign(index = 'accident').assign(sentence_id = 0)

    new_anzen_df = anzen_df.rename(columns={'種別':'category'}).drop('文章', axis=1).assign(index = 'anzen')
    new_anzen_df = new_anzen_df.rename(columns={'文章ID':'sentence_id'}).rename(columns={'分かち書き':'tokens'})

    merge_df = pd.concat([new_anzen_df, new_accident_df], sort=False)
    merge_df.to_csv(merge_csv, encoding='utf_8')

if __name__ == '__main__':
    main(parse_args())

‎jupyter/data/nlp_book.ipynb

+439 lines (large diff not rendered)

‎jupyter/data/scdv.py

+275 lines

import argparse, pickle, time
import numpy as np
import pandas as pd
import lightgbm as lgb
from gensim.models import KeyedVectors
from tqdm import tqdm
from sklearn.mixture import GaussianMixture
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

class SparseCompositeDocumentVectors:
    def __init__(self, num_clusters, pname1, pname2):
        self.min_no = 0
        self.max_no = 0
        self.prob_wordvecs = {}

        #### Input files
        # GloVe word-vector file
        self.glove_word_vector_file = "vector/glove_vectors.txt"

        #### Output files
        # GloVe word vectors with the vocabulary size and vector size prepended (word2vec format)
        self.gensim_glove_word_vector_file = "vector/gensim_glove_vectors.txt"

        # Pickle files holding the GMM results
        self.pname1 = pname1
        self.pname2 = pname2

        #### Other parameters
        # Number of GMM clusters
        self.num_clusters = num_clusters

        # GloVe vector dimensionality
        self.num_features = 50

    def load_glove_vector(self):
        # Read the GloVe word-vector file and write a copy with the vocabulary size and
        # vector size prepended on the first line so that gensim can load it.
        vectors = pd.read_csv(self.glove_word_vector_file, delimiter=' ', index_col=0, header=None)

        vocab_count = vectors.shape[0]        # vocabulary size
        self.num_features = vectors.shape[1]  # vector dimensionality

        with open(self.glove_word_vector_file, 'r') as original, open(self.gensim_glove_word_vector_file, 'w') as transformed:
            transformed.write(f'{vocab_count} {self.num_features}\n')
            transformed.write(original.read())  # the remaining lines are copied unchanged

        # Load the GloVe word vectors
        self.glove_vectors = KeyedVectors.load_word2vec_format(self.gensim_glove_word_vector_file, binary=False)

    def cluster_GMM2(self):
        glove_vectors = self.glove_vectors.vectors

        # Initialize a GMM object and use it for clustering.
        gmm_model = GaussianMixture(n_components=self.num_clusters, covariance_type="tied", init_params='kmeans', max_iter=100)
        # Get cluster assignments.
        gmm_model.fit(glove_vectors)
        idx = gmm_model.predict(glove_vectors)
        print ("Clustering Done...")
        # Get probabilities of cluster assignments.
        idx_proba = gmm_model.predict_proba(glove_vectors)
        # Dump cluster assignments and probability of cluster assignments.
        pickle.dump(idx, open(self.pname1,"wb"))
        print ("Cluster Assignments Saved...")

        pickle.dump(idx_proba,open(self.pname2, "wb"))
        print ("Probabilities of Cluster Assignments Saved...")
        return (idx, idx_proba)

    def cluster_GMM(self):
        # Cluster the GloVe word vectors with a GMM.

        clf = GaussianMixture(
            n_components=self.num_clusters,
            covariance_type="tied",
            init_params="kmeans",
            max_iter=50
        )

        glove_vectors = self.glove_vectors.vectors
        # Get cluster assignments.
        clf.fit(glove_vectors)
        idx = clf.predict(glove_vectors)
        print("Clustering Done...")
        # Get probabilities of cluster assignments.
        idx_proba = clf.predict_proba(glove_vectors)
        # Dump cluster assignments and probability of cluster assignments.
        pickle.dump(idx, open(self.pname1, "wb"))
        print("Cluster Assignments Saved...")
        pickle.dump(idx_proba, open(self.pname2, "wb"))
        print("Probabilities of Cluster Assignments saved...")
        return (idx, idx_proba)

    def read_GMM(self):
        # Load a previously saved GMM clustering result.

        idx = pickle.load(open(self.pname1, "rb"))
        idx_proba = pickle.load(open(self.pname2, "rb"))
        print("Cluster Model Loaded...")
        return (idx, idx_proba)

    def get_idf_dict(self, corpus):
        # Compute IDF values.
        # corpus : list of tokenized (whitespace-joined) sentences

        # Count the words
        count_vectorizer = CountVectorizer()
        X_count = count_vectorizer.fit_transform(corpus)

        # TF-IDF implementation from scikit-learn
        tfidf_vectorizer = TfidfVectorizer(token_pattern="(?u)\\b\\w+\\b")
        X_tfidf = tfidf_vectorizer.fit_transform(corpus)

        feature_names = tfidf_vectorizer.get_feature_names()
        idf = tfidf_vectorizer.idf_

        word_idf_dict = {}
        for pair in zip(feature_names, idf):
            word_idf_dict[pair[0]] = pair[1]

        return feature_names, word_idf_dict

    def get_probability_word_vectors(self, corpus):
        """
        corpus: list of tokenized sentences
        """

        # Load the GloVe word vectors.
        self.load_glove_vector()

        # Per-word GMM cluster assignments and probabilities
        idx, idx_proba = self.cluster_GMM()

        # Index of the most probable cluster for each word
        word_centroid_map = dict(zip(self.glove_vectors.index2word, idx))
        # Probability of each word belonging to each cluster
        word_centroid_prob_map = dict(zip(self.glove_vectors.index2word, idx_proba))

        # Compute IDF values.
        featurenames, word_idf_dict = self.get_idf_dict(corpus)

        for word in word_centroid_map:
            self.prob_wordvecs[word] = np.zeros(self.num_clusters * self.num_features, dtype="float32")
            for index in range(self.num_clusters):
                try:
                    self.prob_wordvecs[word][index*self.num_features:(index+1)*self.num_features] = \
                        self.glove_vectors[word] * word_centroid_prob_map[word][index] * word_idf_dict[word]
                except KeyError:
                    continue
        self.word_centroid_map = word_centroid_map

    def create_cluster_vector_and_gwbowv(self, tokens, flag):
        # Build the SDV (Sparse Document Vector) for one document.

        if isinstance(tokens, str):
            # Tokens coming from the CSV are whitespace-joined strings, so split them back into words.
            tokens = tokens.split()

        bag_of_centroids = np.zeros(self.num_clusters * self.num_features, dtype="float32")
        for token in tokens:
            try:
                temp = self.word_centroid_map[token]
            except KeyError:
                continue
            bag_of_centroids += self.prob_wordvecs[token]
        norm = np.sqrt(np.einsum('...i,...i', bag_of_centroids, bag_of_centroids))
        if norm != 0:
            bag_of_centroids /= norm

        # Record the running min and max over the training vectors so they can be sparsified later.
        if flag:
            self.min_no += min(bag_of_centroids)
            self.max_no += max(bag_of_centroids)
        return bag_of_centroids

    def make_gwbowv(self, corpus, train=True):
        # Build the matrix of document vectors.
        # gwbowv holds the (dense) document vectors.
        gwbowv = np.zeros((len(corpus), self.num_clusters*self.num_features)).astype(np.float32)
        cnt = 0
        for tokens in tqdm(corpus):
            gwbowv[cnt] = self.create_cluster_vector_and_gwbowv(tokens, train)
            cnt += 1

        return gwbowv

    def dump_gwbowv(self, gwbowv, path="gwbowv_matrix.npy", percentage=0.04):
        # Sparsify the document vectors and save them.

        # Compute the sparsification threshold.
        min_no = self.min_no*1.0/gwbowv.shape[0]
        max_no = self.max_no*1.0/gwbowv.shape[0]
        print("Average min: ", min_no)
        print("Average max: ", max_no)
        thres = (abs(max_no) + abs(min_no))/2
        thres = thres * percentage

        # Zero out the components below the threshold to make the vectors sparse.
        temp = abs(gwbowv) < thres
        gwbowv[temp] = 0
        np.save(path, gwbowv)
        print("SDV created and dumped...")

    def load_matrix(self, name):
        return np.load(name)

def parse_args():
    parser = argparse.ArgumentParser(
        description="GloVe and SCDV parameter settings"
    )
    parser.add_argument('--csv_file', type=str)
    parser.add_argument(
        '--num_clusters', type=int, default=20
    )
    parser.add_argument(
        '--pname1', type=str, default="vector/gmm_cluster.pkl"
    )
    parser.add_argument(
        '--pname2', type=str, default="vector/gmm_prob_cluster.pkl"
    )

    return parser.parse_args()

def build_model(csv_file, num_clusters, gmm_pname1, gmm_pname2):
    df = pd.read_csv(csv_file)

    index = df['index']
    doc_id = df['ID']
    sentence_id = df['sentence_id']
    categories = df['category']
    tokens = df['tokens']

    vec = SparseCompositeDocumentVectors(num_clusters, gmm_pname1, gmm_pname2)
    # Compute the probability-weighted word vectors
    vec.get_probability_word_vectors(tokens)
    # Compute the SCDV vectors for the data
    gwbowv = vec.make_gwbowv(tokens)

    print("sentence_id len:{}, gwbowv len:{}".format(len(sentence_id), len(gwbowv)))

    return zip(index, doc_id, sentence_id, categories, gwbowv)

def main(args):
    df = pd.read_csv(args.csv_file)
    categories = df['category'].unique()
    NUM_TOPICS = len(categories)

    # Split into training and test data
    train_data, test_data, train_label, test_label, train_id, test_id = train_test_split(
        df['tokens'], df['category'], df['ID'],
        test_size=0.1, train_size=0.9, stratify=df['category'], shuffle=True)

    vec = SparseCompositeDocumentVectors(args.num_clusters, args.pname1, args.pname2)
    # Compute the probability-weighted word vectors
    vec.get_probability_word_vectors(train_data)
    # Compute SCDV vectors for the training data
    train_gwbowv = vec.make_gwbowv(train_data)
    # Compute SCDV vectors for the test data
    test_gwbowv = vec.make_gwbowv(test_data, False)

    print("train size:{} vector size:{}".format(len(train_gwbowv), len(train_gwbowv[0])))
    print("test size:{} vector size:{}".format(len(test_gwbowv), len(test_gwbowv[0])))

    print("Test start...")

    start = time.time()
    clf = lgb.LGBMClassifier(objective="multiclass")
    clf.fit(train_gwbowv, train_label)
    test_pred = clf.predict(test_gwbowv)

    # print(test_pred)

    print ("Report")
    print (classification_report(test_label, test_pred, digits=6))
    print ("Accuracy: ",clf.score(test_gwbowv, test_label))
    print ("Time taken:", time.time() - start, "\n")

if __name__ == "__main__":
    main(parse_args())
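
A short usage sketch for the class above, assuming the merged CSV produced by merge_csv.py and a vector/ directory containing the GloVe output of glove.sh; the file names mirror the defaults hard-coded in the class and in scdv_to_es.py.

```
import pandas as pd
from scdv import SparseCompositeDocumentVectors

df = pd.read_csv("tokenized/merge.csv")   # assumed output of merge_csv.py
corpus = df["tokens"].astype(str)

vec = SparseCompositeDocumentVectors(
    num_clusters=20,
    pname1="vector/gmm_cluster.pkl",
    pname2="vector/gmm_prob_cluster.pkl",
)
# Cluster the GloVe vectors with a GMM and weight them by cluster probability and IDF.
vec.get_probability_word_vectors(corpus)
# Build one SCDV vector per document, then sparsify and save the matrix.
gwbowv = vec.make_gwbowv(corpus)
vec.dump_gwbowv(gwbowv, path="vector/scdv_matrix.npy")
```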

‎jupyter/data/scdv_to_es.py

+60 lines

import elasticsearch
import argparse, scdv

class elasticsearchClient():
    def __init__(self, host, port):
        self.host = host
        self.port = port
        self.client = elasticsearch.Elasticsearch(self.host + ":" + self.port, timeout=30)

    def update(self, index, doc_id, body):
        response = self.client.update(
            index = index,
            id = doc_id,
            body = body)
        print(response)

def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument('--host', type=str, default='localhost')
    parser.add_argument('--port', type=str, default='9200')
    parser.add_argument('--input_csv', type=str)

    return parser.parse_args()

def create_script(sentence_type, sentence_id, vector):
    # Build the painless update script that writes the SCDV vector into the
    # matching (nested) field of the anzen documents.
    script = ""

    if sentence_type == "title":
        script = '{"script":{"source":"ctx._source.title.vector = params.vector","lang":"painless","params":{"id":"' + sentence_id + '","vector":' + vector + '}}}'
    else:
        script = '{"script":{"source":"for (int i = 0; i < ctx._source.' + sentence_type + '.length; i++) {if(ctx._source.' + sentence_type + '[i].' + sentence_type + '_id == params.id) { ctx._source.' + sentence_type + '[i].vector = params.vector; break}}","lang":"painless","params":{"id":"' + sentence_id + '","vector":"' + vector + '"}}}'

    return script

def main(args):
    client = elasticsearchClient(args.host, args.port)

    scdv_vec = scdv.build_model(args.input_csv, 20, "gmm_cluster.pkl", "gmm_prob_cluster.pkl")

    for index, doc_id, sentence_id, category, vector in scdv_vec:
        if index == 'anzen':
            sentence_type = "title"

            if '_c_' in sentence_id:
                sentence_type = "cause"
            elif '_m_' in sentence_id:
                sentence_type = "measures"
            elif '_s_' in sentence_id:
                sentence_type = "situation"

            vector = str(vector.tolist())
            script = create_script(sentence_type, sentence_id, vector)
            script = script.replace('"[','[').replace(']"',']')

            client.update(index, doc_id, script)
        elif index == 'accident':
            client.update(index, doc_id, {'doc':{'scdv_vector':vector.tolist()}})

if __name__ == '__main__':
    main(parse_args())

‎jupyter/dockerfile_jupyter

+19 lines

FROM jupyter/datascience-notebook

USER root
COPY requirements.txt /tmp/
RUN apt-get update -y && apt-get install vim sudo -y && \
    python -m pip install --upgrade pip setuptools && \
    python -m pip install -r /tmp/requirements.txt --no-cache-dir && \
    python -m pip install https://object-storage.tyo2.conoha.io/v1/nc_2520839e1f9641b08211a5c85243124a/sudachi/SudachiDict_full-20200127.tar.gz && \
    sudachipy link -t full && \
    curl -L "https://ipafont.ipa.go.jp/IPAexfont/ipaexg00201.zip" > font.zip && \
    unzip font.zip && \
    cp ipaexg00201/ipaexg.ttf /usr/share/fonts/truetype/ipaexg.ttf && \
    echo "font.family : IPAexGothic" >> /opt/conda/lib/python3.7/site-packages/matplotlib/mpl-data/matplotlibrc && \
    rm -r ./.cache && \
    jupyter serverextension enable --py jupyterlab && \
    chown -R jovyan /opt/conda
COPY sudachi.json /opt/conda/lib/python3.7/site-packages/sudachipy/resources/
COPY sudachi.json /opt/conda/lib/python3.7/site-packages/ja_ginza_dict/sudachidict/
WORKDIR /home/jovyan/work

‎jupyter/requirements.txt

+15 lines

elasticsearch==7.0.4
gensim==3.8.0
lightgbm==2.3.0
matplotlib==3.1.1
numpy==1.17.4
pandas==0.25.3
pyLDAvis==2.1.2
scikit-learn==0.22
SudachiDict-full
SudachiPy==0.4.2
tqdm==4.40.2
wordcloud==1.6.0
xlrd==1.2.0
ginza
jupyterlab

‎jupyter/sudachi.json

+27 lines

{
  "systemDict": "/opt/conda/lib/python3.7/site-packages/sudachidict_full/resources/system.dic",
  "characterDefinitionFile" : "char.def",
  "inputTextPlugin" : [
    { "class" : "sudachipy.plugin.input_text.DefaultInputTextPlugin" },
    { "class" : "sudachipy.plugin.input_text.ProlongedSoundMarkInputTextPlugin",
      "prolongedSoundMarks": ["ー", "-", "⁓", "〜", "〰"],
      "replacementSymbol": "ー"}
  ],
  "oovProviderPlugin" : [
    { "class" : "sudachipy.plugin.oov.MeCabOovProviderPlugin",
      "charDef" : "char.def",
      "unkDef" : "unk.def" },
    { "class" : "sudachipy.plugin.oov.SimpleOovProviderPlugin",
      "oovPOS" : [ "補助記号", "一般", "*", "*", "*", "*" ],
      "leftId" : 5968,
      "rightId" : 5968,
      "cost" : 3857 }
  ],
  "pathRewritePlugin" : [
    { "class" : "sudachipy.plugin.path_rewrite.JoinNumericPlugin",
      "enableNormalize" : true },
    { "class" : "sudachipy.plugin.path_rewrite.JoinKatakanaOovPlugin",
      "oovPOS" : [ "名詞", "普通名詞", "一般", "*", "*", "*" ],
      "minLength": 3 }
  ]
}
