#!/usr/bin/env python3
# Add location to the doc mapping produced by nutch
import logging
# import sys
import os
import json
# import datetime
from urllib.parse import urlparse

from elasticsearch import Elasticsearch
# from dateutil.parser import parse as parse_date


def write_file(filename, content):
    # Append the line to the file only if it is not already present.
    with open(filename, "a+") as fo:
        fo.seek(0)
        if not any(content == x.rstrip('\r\n') for x in fo):
            fo.write(content + '\n')
            print("write_file: content not present, writing.")
        else:
            print("write_file: content present, next.")


def convert_year(date):
    # Convert a d/m/yy date to yyyy/m/d, treating two-digit years >= 50 as 19xx.
    d, m, y = date.split('/')
    return "/".join(['%d%s' % (19 if int(y) >= 50 else 20, y), m, d])


def write_json_file(workspace, file, data):
    # Serialise the data as pretty-printed JSON into workspace/file.
    file = workspace + file
    print(file)
    print(data)
    with open(file, "w") as jsonFile:
        json_contents = json.dumps(data, sort_keys=True, indent=4)
        print(json_contents)
        jsonFile.write(json_contents)


def documents(workspace, fileList):
    # For each file in the list, read its JSON and yield (filename, contents).
    print("Entering documents function")
    print("length of fileList: " + str(len(fileList)))
    for index, file in enumerate(fileList):
        print(index, '--->', file)
        with open(workspace + file) as f:
            contents = json.load(f)
        # print(contents)
        yield file, contents


def print_hits(results, facet_masks={}):
    """Simple utility function to print the results of a search query."""
    # print(results)
    # print('=' * 80)
    print('Total %d found in %dms' % (results['hits']['total'], results['took']))
    for hit in results['hits']['hits']:
        print(hit['_score'])
        print(hit['_source']['host'])
        # get created date for a repo and fall back to authored_date for a commit
        # created_at = parse_date(hit['_source'].get('tstamp', hit['_source']['lastModified']))
        # print('/%s/%s/%s (%s): %s' % (
        #     hit['_index'], hit['_type'], hit['_id'],
        #     created_at.strftime('%Y-%m-%d'),
        #     hit['_source']['title'].replace('\n', ' ')))
    for facet, mask in facet_masks.items():
        print('-' * 80)
        for d in results['facets'][facet]['terms']:
            print(mask % d)
    print('=' * 80)
    print()


def search_es():
    # Using a normal 'for each' loop, get the domains from the "All good" section.
    # Call a function that searches ES for the docs with the same web/url and, for
    # each one returned, add the location lat/lon. If they had been added by nutch
    # we wouldn't have to change the mapping.
    pass
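

# The workflow described in search_es() is not implemented yet. The sketch below
# is one possible way to do it, assuming the index name 'index', doc type 'doc'
# and the 'url' field used elsewhere in this script, and assuming the mapping
# accepts a 'location' object with 'lat'/'lon'. The function name
# add_location_for_domain is hypothetical and not part of the original script.
def add_location_for_domain(es, domain, lat, lon, index='index', doc_type='doc'):
    # Find all documents whose 'url' field matches the domain...
    results = es.search(
        index=index,
        doc_type=doc_type,
        body={'query': {'match': {'url': domain}}}
    )
    # ...and apply a partial-document update adding the coordinates to each hit.
    for hit in results['hits']['hits']:
        es.update(
            index=hit['_index'],
            doc_type=hit['_type'],
            id=hit['_id'],
            body={'doc': {'location': {'lat': lat, 'lon': lon}}}
        )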


def get_domain(office_workspace, office_fileList):
    # Yield the domain for every office record that has both a website and
    # non-zero coordinates; report records that are missing either.
    for file, data in documents(office_workspace, office_fileList):
        # print("Office fileList: ", office_fileList)
        # print("Office workspace: ", office_workspace)
        print("File: ", file)
        Name = data["name"]
        Website = data["web"]
        latitude = data["location"]["lat"]
        longitude = data["location"]["lon"]
        print("lon: ", longitude)
        print("lat: ", latitude)
        parsed_uri = urlparse(Website)
        domain = '{uri.scheme}://{uri.netloc}/'.format(uri=parsed_uri)
        print("domain: ", domain)
        geo_content = domain + ' ' + latitude + ' ' + longitude
        geo_content_named = Name + ' ' + domain + ' ' + latitude + ' ' + longitude
        if (latitude == '0' and longitude == '0') and domain == ":///":
            print("Missing both")
            # write_file(geodata_missing_both_fo, geo_content_named)
        elif latitude == '0' and longitude == '0':
            print("Missing geo")
            # write_file(geodata_missing_geo_fo, geo_content_named)
        elif domain == ":///":
            print("Missing website")
            # write_file(geodata_missing_web_fo, geo_content_named)
        else:
            print("All good")
            # write_file(geodata_fo, geo_content)
            # write_file(geodata_named_fo, geo_content_named)
            yield domain


# Get the elasticsearch trace logger and write request/response details to a file.
tracer = logging.getLogger('elasticsearch.trace')
tracer.setLevel(logging.INFO)
tracer.addHandler(logging.FileHandler('/tmp/es_trace.log'))

# workspace_home = '/var/tmp/lawsoc_new/'
workspace_home = os.getcwd()
workspace = workspace_home + "/json_out/"
office_workspace = workspace + "office/"
geodata_fo = workspace_home + "/geodata.txt"
geodata_named_fo = workspace_home + "/geodata-named.txt"
geodata_missing_both_fo = workspace_home + "/geodata_missing_both.txt"
geodata_missing_web_fo = workspace_home + "/geodata_missing_website.txt"
geodata_missing_geo_fo = workspace_home + "/geodata_missing_geo.txt"
office_fileList = os.listdir(office_workspace)
document_index = "index"

if __name__ == '__main__':
    es = Elasticsearch()
    # print("Empty search: ")
    # print_hits(es.search(index='index'))
    # for domain in get_domain(office_workspace, office_fileList):
    # Look up a single known URL, excluding test documents
    # (ES 1.x 'filtered'/'not' query syntax).
    result = es.search(
        index='index',
        doc_type='doc',
        body={
            'query': {
                'filtered': {
                    'query': {
                        'match': {'url': 'http://www.renshawderrick.co.uk/'}
                    },
                    'filter': {
                        'not': {
                            'term': {'files': 'test_elasticsearch'}
                        }
                    }
                }
            }
        }
    )
    print_hits(result)
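
    # A possible way to wire the pieces together (not enabled in the original
    # script; add_location_for_domain is the hypothetical sketch defined above,
    # and the coordinates would need to be carried alongside the domain, e.g. by
    # yielding (domain, latitude, longitude) from get_domain):
    #
    # for domain, lat, lon in get_domain(office_workspace, office_fileList):
    #     add_location_for_domain(es, domain, lat, lon)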