#!/usr/bin/python3
#
# Copyright 2020-2021 The Khronos Group Inc.
#
# SPDX-License-Identifier: Apache-2.0

# check_html_xrefs - simple-minded check for internal xrefs in spec HTML
# that don't exist.

# Usage: check_html_xrefs file
# Just reports bad xrefs, not where they occur.
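
# Example invocation (the path below is only illustrative; it assumes the
# spec HTML has already been generated and may differ in your build layout):
#   ./check_html_xrefs out/html/vkspec.html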

import argparse
import re

from lxml import etree
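# Note: lxml is a third-party package; if it is not already available it can
# usually be installed with 'pip install lxml'.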

# Matches the 'sectN' class asciidoctor puts on section <div> elements
SECTNAME = re.compile(r'sect(?P<level>\d+)')

def find_parent_ids(elem, href):
    """Find section titles in parents, which are the 'id' elements of '<hN'
    children of '<div class="sectM"' tags, and N = M + 1. This may be
    specific to the Vulkan spec, though - hierarchy could be different in
    other asciidoctor documents. Returns a list of [ anchor, title ].

    elem - this node
    href - href link text of elem"""

    # Find parent <div> with class="sect#"
    parent = elem.getparent()
    while parent is not None:
        if parent.tag == 'div':
            cssclass = parent.get('class')
            # A parent <div> may have no class attribute; guard against
            # passing None to match()
            if cssclass is not None:
                matches = SECTNAME.match(cssclass)
                if matches is not None:
                    level = int(matches.group('level'))
                    # Look for the corresponding header tag in this div
                    helem = parent.find('./h{}'.format(level + 1))
                    if helem is not None:
                        return [helem.get('id'), ''.join(helem.itertext())]
        parent = parent.getparent()
    return ['** NO PARENT NODE IDENTIFIED **', '']
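
# For reference, the function above expects asciidoctor section markup of
# roughly this shape (an illustrative sketch, not copied from a real spec
# build; the anchor and title are made up):
#
#   <div class="sect2">
#     <h3 id="example-anchor">Example Section Title</h3>
#     ... content containing <a href="#..."> cross-references ...
#   </div>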

if __name__ == '__main__':
    parser = argparse.ArgumentParser()

    parser.add_argument('files', metavar='filename', nargs='*',
                        help='Path to a spec HTML file to check')
    args = parser.parse_args()

    for filename in args.files:
        htmlparser = etree.HTMLParser()
        tree = etree.parse(filename, htmlparser)

        # Find all 'id' elements
        id_elems = tree.findall('.//*[@id]')
        ids = set()
        for elem in id_elems:
            id = elem.get('id')
            if id in ids:
                # print('Duplicate ID attribute:', id)
                pass
            else:
                ids.add(id)

        # Find all internal 'href' attributes and see if they're valid
        # Keep an [element, href] list for tracking parents
        # Also keep a count of each href
        ref_elems = tree.findall('.//a[@href]')
        refs = []
        count = {}
        for elem in ref_elems:
            href = elem.get('href')
            # Only check local (fragment) hrefs; skip external links
            if href.startswith('#'):
                # If there's a corresponding id, the link is fine; otherwise
                # record the element and count the bad href
                href = href[1:]
                if href not in ids:
                    refs.append((elem, href))
                    count[href] = count.get(href, 0) + 1
            else:
                # print('Skipping external href:', href)
                pass
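
        # The report below prints one line per bad xref; its shape is roughly
        # (anchor and section names here are hypothetical, for illustration):
        #   missing-anchor        in enclosing-section-anchor  (Section Title)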
        # Report hrefs not found in ids
        print('Bad links in {}:'.format(filename))
        for (elem, href) in refs:
            parents = find_parent_ids(elem, href)
            print('{:<40} in {:<28} ({})'.format(href, parents[0], parents[1]))