import copy
import re
import os
import tkinter as tk
import tkinter.filedialog as fd
from docx import Document
from docx.enum.text import WD_COLOR_INDEX


def find_occurances_in_paragraph(pattern, paragraph):
	"""Locate every match of ``pattern`` in the text of ``paragraph``.

	Args:
		pattern: Compiled regular expression; only its ``finditer`` method
			is used.
		paragraph: Object whose ``text`` attribute holds the string to search.

	Returns:
		A pair ``(starts, ends)`` of equally long lists. ``starts[k]`` is the
		offset of the first character of the k-th match and ``ends[k]`` the
		offset just past its last character, in the order ``finditer``
		yields the matches.
	"""
	spans = [match.span() for match in pattern.finditer(paragraph.text)]
	starts = [begin for begin, _ in spans]
	ends = [stop for _, stop in spans]
	return starts, ends


def get_target_runs(paragraph, start, end):
	"""Isolate each matched text range in runs of its own and return them.

	The runs of ``paragraph`` are split in place (through
	``split_run_in_two`` / ``split_run_in_three``) so that every range
	``[start[k], end[k])`` of the paragraph text ends up covered by whole
	runs, which the caller can then format independently.

	Args:
		paragraph: Paragraph whose ``runs`` are examined and split.
		start: Start offsets of the matches within the paragraph text. They
			must be ascending and non-overlapping because the runs are
			walked in a single forward pass.
		end: Exclusive end offsets, parallel to ``start``.

	Returns:
		List of the runs that carry the matched text, in document order.
		For a match spanning several runs, runs lying entirely inside the
		match (including runs without text) are returned as they are.

	NOTE(review): offsets are mapped onto runs by summing ``len(run.text)``
	over ``paragraph.runs``; this presumes the offsets were computed on a
	string that is exactly the concatenation of those run texts - confirm
	for paragraphs containing content outside ``paragraph.runs``.
	"""
	targets = []
	i = 0       # index in paragraph.runs of the run being examined
	offset = 0  # offset in the paragraph text at which run i begins

	for index1, index2 in zip(start, end):
		if index2 <= index1:
			# Empty match: there is nothing to format, so split nothing.
			continue

		past_start = False  # True once the run holding index1 has been split
		while True:
			runs = paragraph.runs  # re-read every time: splitting inserts runs
			if i >= len(runs):
				break
			run = runs[i]
			run_start = offset
			run_end = run_start + len(run.text)

			# Half-open tests. A run with no text can therefore never be
			# picked as the run holding the start or the end of a match, so
			# it is never handed to split_run_in_two (which reassigns
			# run.text), and a match beginning exactly on a run boundary is
			# attributed to the run that really holds its first character.
			run_contains_start = run_start <= index1 < run_end
			run_contains_end = run_start < index2 <= run_end

			if run_contains_start and run_contains_end:
				# Whole match inside this run: cut it in three, keep the middle.
				split_runs = split_run_in_three(
					paragraph, run, index1 - run_start, index2 - run_start
				)
				targets.append(split_runs[1])
				# Stay on the leading piece; the next match lies further on
				# and the pieces are walked again from here.
				break

			if run_contains_start:
				# Match starts here and continues in later runs: keep the tail.
				past_start = True
				split_runs = split_run_in_two(paragraph, run, index1 - run_start)
				targets.append(split_runs[1])
				# Skip both halves; together they still span up to run_end.
				i += 2
				offset = run_end
				continue

			if past_start and run_contains_end:
				# Match ends here: keep the leading part only.
				split_runs = split_run_in_two(paragraph, run, index2 - run_start)
				targets.append(split_runs[0])
				break

			if past_start:
				# Run lies completely inside the match.
				targets.append(run)

			i += 1
			offset = run_end

	return targets


def split_run_in_two(paragraph, run, split_index):
	"""Cut ``run`` at ``split_index`` and insert the remainder right after it.

	``run`` keeps ``run.text[:split_index]``. A new run holding
	``run.text[split_index:]`` is created through ``paragraph.add_run``,
	formatted with ``copy_format_manual`` and then moved so that it directly
	follows ``run`` inside the paragraph element.

	Args:
		paragraph: Paragraph that owns ``run``.
		run: Run to cut; its text is overwritten with the leading part.
		split_index: Slice position within ``run.text``.

	Returns:
		``[run, new_run]`` - the shortened original followed by the new run.

	NOTE(review): the leading part is written back by assigning ``run.text``;
	confirm that runs holding more than plain text are acceptable here.
	"""
	full_text = run.text
	head_text = full_text[:split_index]
	tail_text = full_text[split_index:]

	# Child position just behind the run, taken before anything is added.
	insert_at = paragraph._p.index(run.element) + 1

	run.text = head_text
	tail_run = paragraph.add_run(tail_text)
	copy_format_manual(run, tail_run)
	# Re-insert the new run's element directly behind the original run.
	paragraph._p[insert_at:insert_at] = [tail_run.element]
	return [run, tail_run]


def split_run_in_three(paragraph, run, split_start, split_end):
	"""Cut ``run`` at two positions, yielding head, middle and tail runs.

	Args:
		paragraph: Paragraph that owns ``run``.
		run: Run to cut; afterwards it holds the text before ``split_start``.
		split_start: Slice position in ``run.text`` where the middle begins.
		split_end: Slice position in ``run.text`` where the middle ends.

	Returns:
		``[head, middle, tail]`` where ``head`` is ``run`` itself.
	"""
	# Detach the tail first: split_start then still addresses the untouched
	# front part of the run.
	_, tail_run = split_run_in_two(paragraph, run, split_end)
	head_run, middle_run = split_run_in_two(paragraph, run, split_start)
	return [head_run, middle_run, tail_run]


def copy_format_manual(runA, runB):
	"""Copy character formatting from ``runA`` onto ``runB``.

	The font attributes listed below are read from ``runA.font`` and
	assigned to ``runB.font`` one by one, followed by the RGB colour. Besides
	the basic emphasis attributes this includes the typeface name, double
	strike-through and the caps variants, so a run split off from ``runA``
	keeps the same look.

	Args:
		runA: Source run; only read.
		runB: Destination run; its font attributes are overwritten.

	NOTE(review): only attributes reachable through the ``font`` object are
	copied. Formatting that the font object does not expose (for instance a
	separate East Asian typeface or a theme colour) is presumably not carried
	over - confirm whether copying the whole run-properties element is wanted.
	"""
	fontA = runA.font
	fontB = runB.font
	# 'subscript' is deliberately assigned before 'superscript', as in the
	# original sequence of assignments.
	copied_attributes = (
		'name',
		'bold',
		'italic',
		'underline',
		'strike',
		'double_strike',
		'all_caps',
		'small_caps',
		'subscript',
		'superscript',
		'size',
		'highlight_color',
	)
	for attribute in copied_attributes:
		setattr(fontB, attribute, getattr(fontA, attribute))
	fontB.color.rgb = fontA.color.rgb


def ask_input_filename(msg = None, types = None):
	"""Show an "open file" dialog and return the chosen path.

	A Tk root window is created and hidden for the duration of the dialog
	and destroyed afterwards, also when the dialog raises.

	Args:
		msg: Dialog title, passed as ``title``.
		types: File type filters, passed as ``filetypes``; a list of
			``(label, pattern)`` pairs. Defaults to ``[('', '*.*')]``.

	Returns:
		The selected file name, or ``''`` when nothing was selected.
	"""
	if types is None:
		types = [('', '*.*')]

	rt = tk.Tk()
	rt.withdraw()
	try:
		filename = fd.askopenfilename(title = msg, filetypes = types)
	finally:
		# Never leave the hidden root window behind.
		rt.destroy()
	# Any falsy result is reported as '' so callers can test for one value.
	return filename or ''


def ask_output_filename(filename, msg = None, types = None, deftype = '.txt'):
	"""Show a "save as" dialog and return the chosen path.

	A Tk root window is created and hidden for the duration of the dialog
	and destroyed afterwards, also when the dialog raises.

	Args:
		filename: Name pre-filled in the dialog, passed as ``initialfile``.
		msg: Dialog title, passed as ``title``.
		types: File type filters, passed as ``filetypes``; a list of
			``(label, pattern)`` pairs. Defaults to ``[('', '*.*')]``.
		deftype: Passed as ``defaultextension``.

	Returns:
		The selected file name, or ``''`` when nothing was selected.
	"""
	if types is None:
		types = [('', '*.*')]

	rt = tk.Tk()
	rt.withdraw()
	try:
		chosen = fd.asksaveasfilename(
			initialfile = filename,
			title = msg,
			filetypes = types,
			defaultextension = deftype,
		)
	finally:
		# Never leave the hidden root window behind.
		rt.destroy()
	# Any falsy result is reported as '' so callers can test for one value.
	return chosen or ''


def main():
    """Highlight selected characters in a Word file and save the result.

    Steps: ask for an input .docx file, open it, give every match of each
    search pattern the highlight colour paired with that pattern, ask for an
    output file name and save the document there. Only the paragraphs
    exposed by ``doc.paragraphs`` are processed.

    When a dialog is cancelled or the file cannot be opened, a message is
    printed, the function waits for Enter and returns without saving.
    """
    #===========================================================================
    # Select the input file
    #===========================================================================
    input_filename = ask_input_filename('入力データファイル', types = [('Wordファイル(*.docx)', '*.docx')])
    # Any falsy result means the dialog was cancelled.
    if not input_filename:
        print('キャンセルされました。何かキーを押してください。')
        input()
        return

    #===========================================================================
    # Open the file
    #===========================================================================
    try:
        doc = Document(input_filename)
    except Exception:
        # Top-level boundary: any failure to load is reported to the user.
        # Exception (not a bare except) keeps Ctrl-C and SystemExit working.
        print('エラーが起こりました。ファイルに問題があります。\n何かキーを押してください。')
        input()
        return

    #===========================================================================
    # Regular expressions to search for, and the highlight colour of each
    # (pairs of ASCII and full-width forms: punctuation, digits, spaces)
    #===========================================================================
    searchs = ["[,.]", "[，．]", "[0-9]", "[０-９]", "[ ]", "[　]"]
    hlcolors = [WD_COLOR_INDEX.BRIGHT_GREEN,
                WD_COLOR_INDEX.GREEN,
                WD_COLOR_INDEX.RED,
                WD_COLOR_INDEX.PINK,
                WD_COLOR_INDEX.BLUE,
                WD_COLOR_INDEX.TURQUOISE]
    #===========================================================================
    for search, hlcolor in zip(searchs, hlcolors):
        pattern = re.compile(search)
        for paragraph in doc.paragraphs:
            start, end = find_occurances_in_paragraph(pattern, paragraph)
            for run in get_target_runs(paragraph, start, end):
                run.font.highlight_color = hlcolor

    #===========================================================================
    # Save the highlighted file
    #===========================================================================
    # Strip whatever extension the input has instead of assuming 5 characters.
    stem = os.path.splitext(os.path.basename(input_filename))[0]
    initial_filename = stem + '（ハイライト）.docx'
    # deftype is given explicitly: the helper's default would append '.txt'
    # to a name typed without an extension.
    output_filename = ask_output_filename(initial_filename, msg = '出力ファイル', types = [('テキスト (*.docx)', '*.docx')], deftype = '.docx')
    # The dialog was cancelled
    if not output_filename:
        print('キャンセルされました。何かキーを押してください。')
        input()
        return

    doc.save(output_filename)


# Run the tool only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()

