#!/usr/bin/env python3
# Add location to the doc mapping produced by nutch
import logging
# import sys
import os
import json
# import datetime
from urllib.parse import urlparse
from elasticsearch import Elasticsearch
# from dateutil.parser import parse as parse_date


def write_file(filename, content):
    # Append content to the file only if an identical line is not already present.
    with open(filename, "a+") as fo:
        fo.seek(0)
        if not any(content == x.rstrip('\r\n') for x in fo):
            fo.write(content + '\n')
            print("write_file: content not present, writing.")
        else:
            print("write_file: content present, next.")


def convert_year(date):
    # Expand a two-digit year in a d/m/y date and reorder it to y/m/d,
    # e.g. '01/12/99' -> '1999/12/01' and '01/12/05' -> '2005/12/01'.
    d, m, y = date.split('/')
    return "/".join(['%d%s' % (19 if int(y) >= 50 else 20, y), m, d])


def write_json_file(workspace, file, data):
    file = workspace + file
    print(file)
    print(data)
    with open(file, "w") as jsonFile:
        json_contents = json.dumps(data, sort_keys=True, indent=4)
        print(json_contents)
        jsonFile.write(json_contents)


def documents(workspace, fileList):
    print("Entering documents function")
    print("length of fileList: " + str(len(fileList)))
    # For each file in the list, read its JSON and yield it as a generator.
    for index, file in enumerate(fileList):
        print(index, '--->', file)
        with open(workspace + file) as json_file:
            contents = json.load(json_file)
        # print(contents)
        yield file, contents


def print_hits(results, facet_masks={}):
    """Simple utility function to print results of a search query."""
    # print(results)
    # print('=' * 80)
    print('Total %d found in %dms' % (results['hits']['total'], results['took']))
    # if results['hits']['hits']:
    #     print('-' * 80)
    for hit in results['hits']['hits']:
        print(hit['_score'])
        print(hit['_source']['host'])
        # get created date for a repo and fall back to authored_date for a commit
        # created_at = parse_date(hit['_source'].get('tstamp', hit['_source']['lastModified']))
        # print('/%s/%s/%s (%s): %s' % (
        #     hit['_index'], hit['_type'], hit['_id'],
        #     created_at.strftime('%Y-%m-%d'),
        #     hit['_source']['title'].replace('\n', ' ')))
    for facet, mask in facet_masks.items():
        print('-' * 80)
        for d in results['facets'][facet]['terms']:
            print(mask % d)
    print('=' * 80)
    print()


def search_es():
    # TODO: using a normal 'for each' loop, get the domains from the "All good"
    # section, then call a function that searches ES for the docs with the same
    # web/url and, for each doc returned, add the location lat/lon. If nutch had
    # added these we wouldn't have to change the mapping. See add_location_to_docs
    # below for one possible shape of that update step.
    pass
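

# A minimal sketch of the update step described in search_es, assuming the same
# 'index'/'doc' index and doc_type used in __main__ below and that each matching
# document should simply gain a "location" field holding lat/lon. The function
# name and field layout are illustrative assumptions, not part of the original
# scraper.
def add_location_to_docs(es, domain, lat, lon, index="index", doc_type="doc"):
    """Find docs whose url matches the given domain and merge in a location."""
    results = es.search(index=index, doc_type=doc_type,
                        body={'query': {'match': {'url': domain}}})
    for hit in results['hits']['hits']:
        # Partial update: merges the location object into the existing document.
        es.update(index=index, doc_type=doc_type, id=hit['_id'],
                  body={'doc': {'location': {'lat': lat, 'lon': lon}}})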


def get_domain(office_workspace, office_fileList):
    for file, data in documents(office_workspace, office_fileList):
        tmp = data["location"]
        # print("Office fileList: ", office_fileList)
        # print("Office workspace: ", office_workspace)
        print("File: ", file)
        # print(type(tmp))
        # print(len(tmp))
        Name = data["name"]
        Website = data["web"]
        latitude = str(data["location"]["lat"])
        longitude = str(data["location"]["lon"])
        print("lon: ", data["location"]["lon"])
        print("lat: ", data["location"]["lat"])
        parsed_uri = urlparse(Website)
        domain = '{uri.scheme}://{uri.netloc}/'.format(uri=parsed_uri)
        print("domain: ", domain)
        geo_content = (domain + ' ' + latitude + ' ' + longitude)
        geo_content_named = (Name + ' ' + domain + ' ' + latitude + ' ' + longitude)
        if (latitude == '0' and longitude == '0') and domain == ":///":
            print("Missing both")
            # write_file(geodata_missing_both_fo, geo_content_named)
        elif latitude == '0' and longitude == '0':
            print("Missing geo")
            # write_file(geodata_missing_geo_fo, geo_content_named)
        elif domain == ":///":
            print("Missing website")
            # write_file(geodata_missing_web_fo, geo_content_named)
        else:
            print("All good")
            # write_file(geodata_fo, geo_content)
            # write_file(geodata_named_fo, geo_content_named)
            yield domain


# Get the trace logger and set its level.
tracer = logging.getLogger('elasticsearch.trace')
tracer.setLevel(logging.INFO)
tracer.addHandler(logging.FileHandler('/tmp/es_trace.log'))

# workspace_home = '/var/tmp/lawsoc_new/'
workspace_home = os.getcwd()
workspace = workspace_home + "/json_out/"
office_workspace = workspace + "office/"
geodata_fo = workspace_home + "/geodata.txt"
geodata_named_fo = workspace_home + "/geodata-named.txt"
geodata_missing_both_fo = workspace_home + "/geodata_missing_both.txt"
geodata_missing_web_fo = workspace_home + "/geodata_missing_website.txt"
geodata_missing_geo_fo = workspace_home + "/geodata_missing_geo.txt"
office_fileList = os.listdir(office_workspace)
document_index = "index"


if __name__ == '__main__':
    # Get a list of files in the workspace...
    es = Elasticsearch()
    # print("Empty search: ")
    # print_hits(es.search(index='index'))
    # for domain in get_domain(office_workspace, office_fileList):
    result = es.search(
        index='index',
        doc_type='doc',
        body={
            'query': {
                'filtered': {
                    'query': {
                        'match': {'url': 'http://www.renshawderrick.co.uk/'}
                    },
                    'filter': {
                        'not': {
                            'term': {'files': 'test_elasticsearch'}
                        }
                    }
                }
            }
        }
    )
    print_hits(result)
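
    # A hypothetical extension (not part of the original flow): if get_domain were
    # changed to also yield the coordinates, the commented-out loop above could
    # drive the add_location_to_docs sketch, e.g.
    #
    #     for domain, lat, lon in get_domain(office_workspace, office_fileList):
    #         add_location_to_docs(es, domain, lat, lon)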