#!/usr/bin/python3

"""
XLSX spreadsheet converter
Opens a source XLSX workbook and saves the data in a csv file.
See cases 8299, 12536, 12537, 12634 for history
See Phabricator project/tag "xlsx2csv" for current work
"""

import os
import sys
import xlrd # For .xls files
import openpyxl # For .xlsx files
import csv
import datetime
import logging
import re
import codecs

def buildDBName(fileName, sheetName):
	fRoot, fNameRoot = os.path.split(fileName)
	fName, fExt = os.path.splitext(fNameRoot)
	
	#logging.info('Sheetname: \''+sheetName+'\'')
	
	unnamedSheet = re.compile(r'^Sheet[\d+]$')
	invalidSheetname = re.compile(r'^COM[\d*]$')
	if(unnamedSheet.match(sheetName)):
		return fName + '_' + sheetName[5:] + '.csv'
	elif invalidSheetname.match(sheetName):
		logging.error('Sheet '+sheetName+' has an invalid name. Prefixing with \'TABLENAME-\'...')
		return 'TABLENAME-'+sheetName+'.csv'
	else:
		return sheetName + '.csv'
	
def convertFile(f, folder):
	convertedDBs = 0

	floatRound = re.compile(r'^[0-9]+\.0$')
	#logging.debug("xlrd opening workbook")
	# Assuming file extensions are the Word of God like any good Windows user
	ex = os.path.splitext(f)[-1].lower()
	if ex == ".xls":
		wb = xlrd.open_workbook(f)

		for sheetName in wb.sheet_names():
			sheet = wb.sheet_by_name(sheetName)
			#logging.debug(sheetName + "[" + str(sheet.nrows) + "," + str(sheet.ncols) + "]")

			# Read the data from each sheet into sheetData
			sheetData = []
			for row in range(0, sheet.nrows):
				rowData = []
				for cell in range(0, sheet.ncols):
					c = sheet.cell(row, cell)
					# First check for XL_CELL_ERROR and append nothing if so and complain about it; see http://xlrd.readthedocs.io/en/latest/api.html#xlrd.sheet.Cell
					if c.ctype == 5: # error
						logging.error("Error on cell: "+xlrd.error_text_from_code[c.value])
						rowData.append("")
					# Then for some other types try and handle them
					elif c.ctype == 3: # date
						try:
							y, m, d, hour, min, sec = xlrd.xldate_as_tuple(c.value, wb.datemode)
							x = datetime.datetime(y, m, d, hour, min, sec)
							rowData.append(x.strftime('%Y-%m-%d %H:%M:%S'))
						except Exception:
							rowData.append("Invalid Date")
					elif c.ctype == 2: # float
						val = str(c.value)
						if floatRound.match(val):
							val = val[:-2]
						rowData.append(val)
					else:
						v = c.value
						if row == 0:
							try:
								v = v.upper()
							except Exception:
								logging.error("Attempted to uppercase cell under assumption that it was a column header, but failed!")
						rowData.append(v)
				# If every cell is empty, skip this row; see T1934. Maybe better done before but xlrd confuses me a bit sometimes tbqh so I took a shortcut.
				if all([not d.strip() for d in rowData]):
					continue
				sheetData.append(rowData)
			#logging.debug(sheetData[0:5])
			# CSV output file
			outFile = buildDBName(f, sheetName)
			if(not outFile):
				continue
			outFile = os.path.join(folder, outFile)
			#logging.debug(outFile)

			# Output the data to CSV
			if len(sheetData)>0:
				convertedDBs += 1
				logging.info(outFile+' contains '+str(len(sheetData))+' records.')
				# see https://docs.python.org/3/library/csv.html#id3 for "newline=''" rationale
				with open(outFile, "w", encoding="utf-16", newline='') as outFileHandle:
					writer = csv.writer(outFileHandle)
					writer.writerows(sheetData)

	else: # If it wasn't .xls, we're just assuming it's an XLSX file that openpyxl can handle
		logging.info("Using openpyxl for presumably-xlsx file")
		wb = openpyxl.load_workbook(f)

		for sheetName in wb.sheetnames:
			sheet = wb[sheetName]

			# Read the data from each sheet into sheetData
			sheetData = []
			for row in sheet.iter_rows():
				# If every cell is empty, skip this row; see T1934
				if all(cell.value is None for cell in row):
					logging.debug("Skipping blank row")
					continue
				logging.debug(row)
				rowData = []
				for cell in row:
					# Should maybe check for errors? Not doing so yet though.
					if isinstance(cell, openpyxl.cell.cell.MergedCell):
						logging.debug("Skipping MergedCell, a type of cell that is only for displays and has no actual value (in a literal sense)")
					elif cell.is_date:
						x = cell.value
						logging.debug(x)
						try:
							rowData.append(x.strftime('%Y-%m-%d %H:%M:%S'))
						except:
							logging.error(f"Row {cell.row} Column {cell.column} claims to be a date, but the value resisted date reformatting.")
					# There are other data types, but it wasn't immediately obvious to keith how to check for them
					elif cell.hyperlink:
						# Format using what I think is the Excel standard formatting for hyperlinks
						rowData.append('=HYPERLINK("{}", "{}")'.format(cell.hyperlink.target, cell.value))
					else:
						v = cell.value
						if type(v) is str:
							v = v.strip()
						rowData.append(v)
				sheetData.append(rowData)
			try:
				sheetData[0] = [v.upper() for v in sheetData[0]]
			except Exception:
				logging.error("Attempted to uppercase first row under assumption that it is a header row, but failed!")
			# CSV output file
			outFile = buildDBName(f, sheetName)
			if(not outFile):
				continue
			outFile = os.path.join(folder, outFile)
			#logging.debug(outFile)

			# Output the data to CSV
			if len(sheetData)>0:
				convertedDBs += 1
				logging.info(outFile+' contains '+str(len(sheetData))+' records.')
				# see https://docs.python.org/3/library/csv.html#id3 for "newline=''" rationale
				with open(outFile, "w", encoding="utf-16", newline='') as outFileHandle:
					writer = csv.writer(outFileHandle)
					writer.writerows(sheetData)

	
	return convertedDBs


def main():
	codepage = os.environ.get("DBDOC_CODEPAGE", "utf-8")
	sys.stdout = codecs.getwriter(codepage)(sys.stdout.detach())

	logFormatter = logging.Formatter("%(message)s")
	rootLogger = logging.getLogger()
	rootLogger.setLevel(logging.ERROR)
	fileHandler = logging.FileHandler("xlsx2csv.err", mode="w", encoding="utf-8")
	fileHandler.setFormatter(logFormatter)
	fileHandler.setLevel(logging.ERROR)
	rootLogger.addHandler(fileHandler)
	consoleHandler = logging.StreamHandler(sys.stdout)
	consoleHandler.setFormatter(logFormatter)
	consoleHandler.setLevel(logging.ERROR)
	rootLogger.addHandler(consoleHandler)

	helptxt = 'Usage: xlsx2csv <database> <folder for output [optional, default is current working directory]>'
	argc = len(sys.argv)
	if argc<2 or argc>3:
		logging.error('Invalid Arguments. '+helptxt)
		return -1
	if sys.argv[1] == "--help":
		logging.error(helptxt)
		return -1
	filename = sys.argv[1]
	folder = '.'
	if argc==3:
		folder = sys.argv[2]

	try:
		logging.info('Processing '+filename)
		n = convertFile(filename, folder)
		logging.info('Finished!')
		return n
	except Exception as e:
		logging.error('Converting Database Failed:')
		logging.error('\t'+str(e))
		return -1
	except KeyboardInterrupt:
		return -2
	
if __name__ == '__main__':
	main()
