(I updated the code with comments)
Hi everyone, I am creating an xml file with python using xml.etree.ElementTree. In input I have a docx, where the titles are formatted like "Header1" and the paragraphs under the title will be the text of the title (in short).
I need to format the xml properly. At the moment I have this code (I attach it)
One of the xml output I have is
https://1drv.ms/i/s!AohXx8uDzsTq8xsoHKat...T?e=omJVaQ
...here after "lines" elem. the text is not wrapping. Instead I want that "lines" is like a group of the string like is here
https://1drv.ms/i/s!AohXx8uDzsTq8xzcRknU...g?e=QvGaYD
I attached also a test file.
If you prefer, here is the code:
TEST.docx (Size: 13.12 KB / Downloads: 264)
xml3_commented.py (Size: 7.6 KB / Downloads: 248)
Hi everyone, I am creating an xml file with python using xml.etree.ElementTree. In input I have a docx, where the titles are formatted like "Header1" and the paragraphs under the title will be the text of the title (in short).
I need to format the xml properly. At the moment I have this code (I attach it)
One of the xml output I have is
https://1drv.ms/i/s!AohXx8uDzsTq8xsoHKat...T?e=omJVaQ
...here after "lines" elem. the text is not wrapping. Instead I want that "lines" is like a group of the string like is here
https://1drv.ms/i/s!AohXx8uDzsTq8xzcRknU...g?e=QvGaYD
I attached also a test file.
If you prefer, here is the code:
import os
import re
import xml.etree.ElementTree as ET
from docx import Document
def generate_xml(docx_file, output_folder):
# Read the docx file
document = Document(docx_file)
# Initialize variables
verse_num = 1
verse_type = 'V'
song_count = 0
lines_text = ''
verse_order_set = set()
verseOrder = None
lyrics = None
root = None
lines = None
# Loop through the paragraphs in the docx file
for i, para in enumerate(document.paragraphs):
# Check if the paragraph style is "Heading 1"
if para.style.name == 'Heading 1':
# Create the root "song" element and set its attributes
root = ET.Element("song")
root.set("xmlns", "http://openlyrics.info/namespace/2009/song")
root.set("version", "0.8")
root.set("createdIn", "OpenLP 2.4.3")
root.set("modifiedIn", "FreeWorship 3.2301.280.0")
root.set("modifiedDate", "2023-04-27T18:47:00")
# Create the child elements of the root element
properties = ET.SubElement(root, 'properties')
titles = ET.SubElement(properties, 'titles')
title = ET.SubElement(titles, 'title')
title.text = para.text
authors = ET.SubElement(properties, 'authors')
author = ET.SubElement(authors, 'author')
author.text="Author Unknown"
verseOrder = ET.SubElement(properties, 'verseOrder')
verseOrder.text = ''
songbooks = ET.SubElement(properties, 'songbooks')
songbook = ET.SubElement(songbooks, 'songbook')
songbook.set("name","Superbook")
songbook.set("entry","SuperBook")
lyrics = ET.SubElement(root, 'lyrics')
# Increment the song count and reset variables for the next song
song_count += 1
verse_order_set.clear()
verse_num = 1
bridge_num = 1
chorus_num = 1
else: # Check if the paragraph text is not empty
if para.text.strip():
first_word = para.text.split()[0]
# Check if the first word is a verse number
if (first_word[-1] == '.' or first_word[-1] == ')') and first_word[:-1].isdigit():
# Add the lines text to the "lines" element
if lines_text and lines is not None:
lines.text = lines_text[:-5]
lines_text = ''
# Increment the verse number and set the verse type to "V"
verse_num += 1
verse_type = 'V'
# Remove the verse number from the paragraph text
para.text = para.text[len(first_word):].strip()
# Set the verse name
verse_name = f'{verse_type}{verse_num}'
# Check if the first word is "Coro"
elif 'Coro' in first_word:
# Add the lines text to the "lines" element
if lines_text and lines is not None:
lines.text = lines_text[:-5]
lines_text = ''
# Set the verse type to "C"
verse_type = 'C'
# Remove the first word from the paragraph text
para.text = para.text[len(first_word):].strip()
# Set the verse name
verse_name = f'{verse_type}{chorus_num}'
# Increment the chorus number
chorus_num += 1
# Check if the first word is "Bridge:"
elif first_word == 'Bridge:':
# Add the lines text to the "lines" element
if lines_text and lines is not None:
lines.text = lines_text[:-5]
lines_text = ''
# Set the verse type to "B"
verse_type = 'B'
# Remove the first word from the paragraph text
para.text = para.text[len(first_word):].strip()
# Set the verse name
verse_name = f'{verse_type}{bridge_num}'
else:
# Set the verse type to "V"
verse_type = 'V'
# Set the verse name
verse_name = f'{verse_type}{verse_num}'
# Check if the verse name is not in the verse order set
if verse_name not in verse_order_set:
# Add the verse name to the "verseOrder" element text
if verseOrder is not None:
if not verseOrder.text:
verseOrder.text += f'{verse_name}'
else:
verseOrder.text += f'{verse_name}'
# Add the verse name to the verse order set
verse_order_set.add(verse_name)
# Check if there is no lines text yet
if not lines_text and lyrics is not None:
# Create the "verse" and "lines" elements
verse = ET.SubElement(lyrics, 'verse', attrib={'name': verse_name, 'lang': ''})
lines= ET.SubElement(verse, 'lines')
# Initialize a list to store the text runs
text_runs = []
# Loop through the runs in the paragraph
for run in para.runs:
# Check if the run font color is None or black
if run.font.color.rgb is None or run.font.color.rgb == "000000":
# Append the run text to the text runs list
text_runs.append(run.text)
else:
# Split the run text into words and add brackets around each word
words = run.text.split()
bracketed_words = ['[' + word + ']' for word in words]
# Join the bracketed words and append them to the text runs list
text_runs.append(' '.join(bracketed_words))
# Join the text runs and add line breaks
para_text = ''.join(text_runs).replace('\n','<br/>') +'<br/>'
# Add the paragraph text to the lines text
lines_text += para_text
# Check if this is the last paragraph or if the next paragraph style is "Heading 1"
if i == len(document.paragraphs) - 1 or document.paragraphs[i + 1].style.name == 'Heading 1':
# Check if there is a root element
if root is not None:
# Add the lines text to the "lines" element
if lines_text and lyrics is not None and lines is not None:
lines.text = lines_text[:-5]
lines_text = ''
# Create an ElementTree object and write it to an XML file
tree = ET.ElementTree(root)
filename = re.sub(r'[^\w\s-]', '', title.text).strip().lower()
# filename = re.sub(r'[-\s]+', '-', filename)
output_file = os.path.join(output_folder, f'{filename}.xml')
tree.write(output_file, encoding='utf-8', xml_declaration=True)
# Call the generate_xml function with a docx file and an output folder as arguments
generate_xml("C:/Users/Daniele/Downloads/test.docx","C:/Users/Daniele/Downloads/xml")
TEST.docx (Size: 13.12 KB / Downloads: 264)
Attached Files
