99 lines
3.5 KiB
Python
99 lines
3.5 KiB
Python
#!/usr/bin/python3
|
|
|
|
# For each office JSON file, extract the website's URL domain and write it,
# together with the latitude and longitude, to geodata.txt.
|
|
|
|
import os
|
|
import json
|
|
from urllib.parse import urlparse
|
|
|
|
|
|
def write_file(filename, content):
    """Append *content* as a line to *filename* unless that line already exists.

    Existing lines are compared with their trailing CR/LF characters removed.
    A status message is printed either way.

    Args:
        filename: Path of the text file; it is created if it does not exist.
        content: Line text to record, without a trailing newline.
    """
    with open(filename, "a+") as handle:
        # Rewind so the existing lines can be scanned; in append mode the
        # write below still lands at the end of the file.
        handle.seek(0)
        for existing_line in handle:
            if existing_line.rstrip('\r\n') == content:
                print("write_file: content present, next.")
                return
        handle.write(content + '\n')
        print("write_file: content not present, writing.")
|
|
|
|
|
|
def convert_year(date):
    """Convert a ``dd/mm/yy`` date string to ``yyyy/mm/dd``.

    Two-digit years are expanded with a pivot at 50: ``50``-``99`` become
    ``1950``-``1999`` and ``00``-``49`` become ``2000``-``2049``. A
    single-digit year is zero-padded before the century is added, and a year
    that already has four digits is passed through unchanged.

    Args:
        date: Date text made of exactly three ``/``-separated fields.

    Returns:
        The date re-ordered as ``year/month/day``.

    Raises:
        ValueError: If *date* does not split into three fields or the year
            is not an integer.
    """
    day, month, year = date.split('/')
    # A year that already carries its century must not get another prefix.
    if len(year) == 4 and year.isdigit():
        return "/".join([year, month, day])
    century = 19 if int(year) >= 50 else 20
    # zfill keeps "5" from turning into the three-digit year "205".
    return "/".join(['%d%s' % (century, year.zfill(2)), month, day])
|
|
|
|
|
|
def write_json_file(workspace, file, data):
    """Write *data* as pretty-printed, key-sorted JSON to ``workspace + file``.

    The JSON text is produced before the target is opened for writing, so a
    serialisation failure leaves an existing file untouched instead of
    truncating it. The path, the raw data and the JSON text are printed.

    Args:
        workspace: Directory prefix, concatenated verbatim with *file*.
        file: File name appended to *workspace*.
        data: Object to serialise with ``json.dumps``.

    Raises:
        TypeError: If *data* holds a value ``json.dumps`` cannot encode.
        ValueError: If *data* contains a circular reference.
        OSError: If the target cannot be opened for writing.
    """
    path = workspace + file
    print(path)
    print(data)
    # Serialise first: opening with "w" truncates the file immediately.
    json_contents = json.dumps(data, sort_keys=True, indent=4)
    print(json_contents)
    with open(path, "w") as json_file:
        json_file.write(json_contents)
|
|
|
|
|
|
def documents(workspace, fileList):
    """Lazily yield ``(file name, parsed JSON)`` for every entry of *fileList*.

    Progress messages are printed as the generator is consumed.

    Args:
        workspace: Directory prefix, concatenated verbatim with each name.
        fileList: Names of the JSON files to read.

    Yields:
        ``(file, contents)`` tuples, where *contents* is the decoded JSON
        document of that file.

    Raises:
        OSError: If a file cannot be opened.
        ValueError: If a file does not contain valid JSON.
    """
    print("Entering documents function")
    print("length of fileList: " + str(len(fileList)))
    for index, file in enumerate(fileList):
        print(index, '--->', file)
        # Close each handle as soon as it is parsed rather than leaving it
        # open until garbage collection.
        with open(workspace + file) as handle:
            contents = json.load(handle)
        yield file, contents
|
|
|
|
|
|
# All paths are anchored at the directory the script is started from
# (an earlier revision hard-coded '/var/tmp/lawsoc_new/').
workspace_home = os.getcwd()

# Input tree: JSON documents, with the office records in "office/".
workspace = "{}/json_out/".format(workspace_home)
office_workspace = "{}office/".format(workspace)

# Output tree: one text file per classification of an office record.
workspace_out = "{}/text_files/".format(workspace_home)
geodata_fo = "{}geodata.txt".format(workspace_out)
geodata_named_fo = "{}geodata-named.txt".format(workspace_out)
geodata_missing_both_fo = "{}geodata_missing_both.txt".format(workspace_out)
geodata_missing_web_fo = "{}geodata_missing_website.txt".format(workspace_out)
geodata_missing_geo_fo = "{}geodata_missing_geo.txt".format(workspace_out)
|
|
|
|
if __name__ == '__main__':
    # Classify every office record by which of website / coordinates it has
    # and append one line per record to the matching output file.
    office_fileList = os.listdir(office_workspace)

    for file, data in documents(office_workspace, office_fileList):
        print("File: ", file)

        name = data["name"]
        website = data["web"]
        # Normalise the coordinates to text once, so the concatenation and
        # the '0' checks below agree whether the JSON stores them as strings
        # or as numbers.
        latitude = str(data["location"]["lat"])
        longitude = str(data["location"]["lon"])
        print("lon: ", longitude)
        print("lat: ", latitude)

        parsed_uri = urlparse(website)
        domain = '{uri.scheme}://{uri.netloc}/'.format(uri=parsed_uri)
        print("domain: ", domain)

        # NOTE(review): the named line was previously assembled with
        # backslash continuations inside string literals, which embed any
        # indentation of the continuation lines in the output. Confirm that
        # single-space separators are the intended format.
        geo_content = ' '.join((domain, latitude, longitude))
        geo_content_named = ' '.join((name, domain, latitude, longitude))

        # A '0'/'0' pair marks absent coordinates; an empty scheme and
        # netloc render as ":///" and mark an absent website.
        missing_geo = latitude == '0' and longitude == '0'
        missing_web = domain == ":///"

        if missing_geo and missing_web:
            print("Missing both")
            write_file(geodata_missing_both_fo, geo_content_named)
        elif missing_geo:
            print("Missing geo")
            write_file(geodata_missing_geo_fo, geo_content_named)
        elif missing_web:
            print("Missing website")
            write_file(geodata_missing_web_fo, geo_content_named)
        else:
            print("All good")
            write_file(geodata_fo, geo_content)
            write_file(geodata_named_fo, geo_content_named)
|