mirror of
https://github.com/taroved/pol
synced 2025-05-18 07:10:09 -07:00
131 lines
3.7 KiB
Python
131 lines
3.7 KiB
Python
# Positions of the fields inside a tag node of the html_json tree:
# tag[I_TAGNAME] is the tag name, tag[I_ATTRS] the attribute mapping and
# tag[I_CHILDREN] the child tag nodes.
I_TAGNAME, I_ATTRS, I_CHILDREN, I_PARENT = range(4)  # I_PARENT: not in use
|
|
|
|
def build_xpathes(item_tag_ids, html_json):
    """Placeholder: currently performs no work and returns None.

    NOTE(review): the original body only created an unused local list, so
    this looks like an unfinished stub -- confirm before relying on it.

    :param item_tag_ids: unused.
    :param html_json: unused.
    :return: None.
    """
    return None
|
|
|
|
def _get_fork_stack(stacks):
|
|
first = stacks.itervalues().next() # just first stack
|
|
for i in range(0, len(first)):
|
|
tag = first[i]
|
|
for name in stacks:
|
|
if tag != stacks[name][i]:
|
|
return stacks[name][:i]
|
|
return first
|
|
|
|
def _build_parent_stack(html_json, tag_id):
    """Return the chain of tags from the root down to the tag with ``tag_id``.

    The tree is searched depth-first and the first tag whose 'tag-id'
    attribute equals ``tag_id`` wins.

    :param html_json: root tag node of the tree.
    :param tag_id: value of the 'tag-id' attribute to look for.
    :return: list starting with ``html_json`` and ending with the matching
        tag; just ``[html_json]`` when the root itself matches or when no
        tag matches at all.
    """

    def path_below(node):
        # Tags strictly below ``node`` that lead to the target (empty when
        # ``node`` is the target itself), or None when the target is not in
        # this subtree.
        if node[I_ATTRS]['tag-id'] == tag_id:
            return []
        for child in node[I_CHILDREN]:
            tail = path_below(child)
            if tail is not None:
                return [child] + tail
        return None

    below_root = path_below(html_json)
    if below_root is None:
        # Not found: only the root is reported.
        below_root = []
    return [html_json] + below_root
|
|
|
|
def _find_tags_by_tag_names(html_json, parent_tag_names):
    """Collect every tag reachable from the root via the given tag names.

    ``parent_tag_names[0]`` must match the root's name,
    ``parent_tag_names[1]`` one of its children, and so on; the tags that
    match the last name are collected.

    :param html_json: root tag node of the tree.
    :param parent_tag_names: sequence of tag names, root first.
    :return: list of matching tags in depth-first order.
    """
    last_level = len(parent_tag_names) - 1
    matches = []

    def descend(node, level):
        if node[I_TAGNAME] != parent_tag_names[level]:
            return
        if level == last_level:
            # The whole name chain matched.
            matches.append(node)
            return
        if level < last_level:
            for child in node[I_CHILDREN]:
                descend(child, level + 1)

    descend(html_json, 0)
    return matches
|
|
|
|
class PathItem(object):
    """One step of a relative path between tags (an allusion to XPath).

    A step either moves to the parent (``go_parent``) or selects a child
    named ``child_tag`` at the 0-based position ``child_index`` among the
    children with that name.
    """

    # Class-level defaults, kept so the attributes also exist on the class.
    go_parent = False
    child_tag = None
    child_index = None

    def __init__(self, go_parent=False, child_tag=None, child_index=None):
        self.go_parent = go_parent
        self.child_tag = child_tag
        self.child_index = child_index

    def __repr__(self):
        """Render the step as '..', 'tag' or 'tag[n]' (n is 1-based)."""
        if self.go_parent:
            return '..'
        if self.child_index is None:
            # No position set: omit the predicate rather than fail on
            # ``None + 1``.
            return '%s' % (self.child_tag,)
        # child_index is 0-based, the rendered position is 1-based.
        return '%s[%s]' % (self.child_tag, self.child_index + 1)
|
|
|
|
|
|
def _build_path(stack):
    """Turn a chain of nested tags into a list of child-selection steps.

    For each consecutive (parent, target) pair in ``stack`` one PathItem is
    produced, holding the target's tag name and its 0-based position among
    the parent's children with that same name.

    :param stack: list of tags, each one expected to be a child of the
        previous one.
    :return: list of PathItem, one per pair (empty for stacks shorter than
        two tags).
    """
    steps = []
    for parent, target in zip(stack, stack[1:]):
        target_name = target[I_TAGNAME]
        position = 0
        for sibling in parent[I_CHILDREN]:
            if sibling[I_TAGNAME] != target_name:
                continue
            if sibling == target:
                break
            # A same-named sibling that precedes the target.
            position += 1
        steps.append(PathItem(child_tag=target_name, child_index=position))
    return steps
|
|
|
|
def _find_tag(html_json, tag, path):
    """Follow ``path`` downwards from ``tag`` and return the tag reached.

    :param html_json: not used by this function.
    :param tag: tag node to start from.
    :param path: iterable of steps exposing ``child_tag`` and
        ``child_index`` (0-based position among same-named children).
    :return: the tag at the end of the path (``tag`` itself for an empty
        path), or None as soon as a step cannot be resolved.
    """
    current = tag
    for step in path:
        remaining = step.child_index
        found = None
        for child in current[I_CHILDREN]:
            if child[I_TAGNAME] != step.child_tag:
                continue
            if remaining == 0:
                found = child
                break
            # Skip this same-named child and keep counting down.
            remaining -= 1
        if found is None:
            return None
        current = found
    return current
|
|
|
|
def get_selection_tag_ids(item_tag_ids, html_json):
    """Find every group of tags laid out like the given sample tags.

    The sample tags' ancestor chains are compared to find their deepest
    common ancestor (the "fork"). Every tag in the tree that is reachable by
    the same chain of tag names as the fork is then probed with each item's
    relative path; a fork tag contributes only if all item paths resolve.

    :param item_tag_ids: mapping of item name -> 'tag-id' of a sample tag.
    :param html_json: root tag node of the tree.
    :return: mapping of item name -> list of 'tag-id' values, one entry per
        contributing fork tag (same order in every list); an empty dict when
        ``item_tag_ids`` is empty.
    """
    # Nothing selected: avoid failing inside the helpers on an empty mapping.
    if not item_tag_ids:
        return {}

    # Build parent stacks (root ... sample tag) for every item name.
    parent_stacks = {
        name: _build_parent_stack(html_json, tag_id)
        for name, tag_id in item_tag_ids.items()
    }

    # The fork is the common prefix of all stacks, expressed as tag names.
    fork_stack = _get_fork_stack(parent_stacks)
    fork_path = [tag[I_TAGNAME] for tag in fork_stack]

    # Item paths are relative to the fork tag, so each sliced stack starts
    # at the fork tag itself (hence the -1).
    fork_len = len(fork_path) - 1
    selection_pathes = {
        name: _build_path(stack[fork_len:])
        for name, stack in parent_stacks.items()
    }

    # All tags that are reachable through the same tag-name chain as the fork.
    fork_tags = _find_tags_by_tag_names(html_json, fork_path)

    selection_ids = {name: [] for name in selection_pathes}
    for fork_tag in fork_tags:
        ids = {}
        for name, path in selection_pathes.items():
            tag = _find_tag(html_json, fork_tag, path)
            if tag is None:
                # One item is missing under this fork tag: skip it entirely.
                break
            ids[name] = tag[I_ATTRS]['tag-id']
        else:
            for name, tag_id in ids.items():
                selection_ids[name].append(tag_id)

    return selection_ids
|