mirror of
https://github.com/taroved/pol
synced 2025-05-21 00:20:22 -07:00
first version of selection calculation is ready
This commit is contained in:
parent
c7af38f8e3
commit
24816edb33
@ -1,87 +1,128 @@
|
|||||||
|
# Positions of the fields inside one tag entry of the html_json tree.
# Every tag is addressed as a sequence: tag[I_TAGNAME], tag[I_ATTRS], ...
I_TAGNAME = 0   # tag name, compared against path/tag-name lists
I_ATTRS = 1     # attribute mapping; the code reads its 'tag-id' key
I_CHILDREN = 2  # iterable of child tag entries
I_PARENT = 3  # not in use
|
||||||
|
|
||||||
def build_xpathes(item_tag_ids, html_json):
    """Placeholder for xpath construction; currently computes nothing.

    The visible body only creates an empty local list and falls through,
    so the function returns None and ignores both arguments.

    NOTE(review): reconstructed from a flattened diff view; presumably the
    definitions that follow are module-level siblings and not nested here
    -- confirm against the real file.
    """
    shared_tag_stack = []
|
||||||
|
|
||||||
def _build_parent_stack(html_json, tag_id):
    """Return the chain of tags leading from the root down to ``tag_id``.

    html_json -- root tag entry of the tree; each tag is indexed with
                 I_ATTRS (mapping holding 'tag-id') and I_CHILDREN.
    tag_id    -- value looked up in tag[I_ATTRS]['tag-id'].

    Returns a list starting with ``html_json`` and ending with the matching
    tag.  When no tag carries ``tag_id`` the result is just ``[html_json]``,
    which is indistinguishable from "the root itself matched".
    """
    tag_stack = []

    def walk_by_tag(tag):
        # Depth-first search; on the way back up every tag on the path to
        # the match is recorded, deepest first.
        if tag[I_ATTRS]['tag-id'] == tag_id:
            return True
        for subtag in tag[I_CHILDREN]:
            if walk_by_tag(subtag):
                tag_stack.append(subtag)
                return True
        return False

    walk_by_tag(html_json)
    # The root is never appended by the walk itself, so add it last ...
    tag_stack.append(html_json)

    # ... and flip the deepest-first order into root-first order.
    return list(reversed(tag_stack))
|
||||||
|
|
||||||
def _find_tags_by_tag_names(html_json, parent_tag_names):
    """Collect every tag whose root-to-tag chain of names matches exactly.

    html_json        -- root tag entry of the tree.
    parent_tag_names -- tag names from the root down to the wanted tag;
                        parent_tag_names[0] is compared with the root.

    Returns a list of ``(tag, stack)`` tuples where ``stack`` is a copy of
    the tag chain from the root down to ``tag`` (inclusive).  An empty
    ``parent_tag_names`` yields an empty list.
    """
    tags = []
    if not parent_tag_names:
        # Nothing can match; also avoids indexing an empty name list.
        return tags

    tag_stack = []
    last_depth = len(parent_tag_names) - 1

    def walk_by_tag(tag, depth):
        tag_stack.append(tag)
        if tag[I_TAGNAME] == parent_tag_names[depth]:
            if depth == last_depth:  # is a tie
                # Copy the stack: it keeps mutating while the walk goes on.
                tags.append((tag, list(tag_stack)))
            else:
                for subtag in tag[I_CHILDREN]:
                    walk_by_tag(subtag, depth + 1)
        tag_stack.pop()

    walk_by_tag(html_json, 0)
    return tags
|
||||||
|
|
||||||
# allusion to xpath
class PathItem(object):
    """One step of a relative path between two tags.

    A step either moves to the parent (``go_parent`` true, printed as
    '..') or selects a child by tag name and zero-based position among the
    same-named children (printed one-based, as 'tag[n]').
    """

    # Class-level defaults, kept so attribute access works on the class too.
    go_parent = False
    child_tag = None
    child_index = None

    def __init__(self, go_parent=False, child_tag=None, child_index=None):
        self.go_parent = go_parent
        self.child_tag = child_tag
        self.child_index = child_index

    def __repr__(self):
        if self.go_parent:
            return '..'
        if self.child_index is None:
            # No position set: print the bare name instead of failing on
            # ``None + 1``.
            return '%s' % (self.child_tag,)
        return '%s[%s]' % (self.child_tag, self.child_index + 1)
|
||||||
|
|
||||||
|
|
||||||
def _build_path(stack, target_stack):
    """Build the relative path from the last tag of ``stack`` to the last
    tag of ``target_stack``.

    stack        -- tag chain from the root down to the starting tag.
    target_stack -- tag chain from the root down to the destination tag.

    Returns a list of PathItem: first the '..' steps that climb from the
    starting tag up to the deepest tag shared by both chains (the fork),
    then one 'tag[n]' step per tag of ``target_stack`` below the fork.

    Raises ValueError when the two chains do not even share their first
    tag, because no relative path exists then.
    """
    # Index of the deepest common tag.  zip() stops at the shorter chain, so
    # a target that is an ancestor of the start is handled as well.
    fork_i = -1
    for own_tag, target_tag in zip(stack, target_stack):
        if own_tag != target_tag:
            break
        fork_i += 1
    if fork_i < 0:
        raise ValueError('stacks have no common root tag')

    # shifts to parent; like '..' in xpath
    # Climb exactly up to the fork, not above it: the path is replayed from
    # other starting tags, and re-entering the fork by index would always
    # land in the sample's own fork tag.
    path = [PathItem(go_parent=True) for _ in range(fork_i, len(stack) - 1)]

    # address by children with indexes; like 'tag[n]' in xpath
    for i in range(fork_i + 1, len(target_stack)):
        tag = target_stack[i]
        tag_name = tag[I_TAGNAME]
        # Zero-based position of ``tag`` among same-named siblings.
        idx = 0
        for sibling in target_stack[i - 1][I_CHILDREN]:
            if sibling[I_TAGNAME] == tag_name:
                if sibling == tag:
                    break
                idx += 1
        path.append(PathItem(child_tag=tag_name, child_index=idx))

    return path
|
||||||
|
|
||||||
|
def _find_tag(html_json, source_tag_info, path):
    """Follow ``path`` starting from a source tag and return the tag reached.

    html_json       -- root of the tree; not read by this function.
    source_tag_info -- ``(tag, tag_stack)`` pair: the starting tag and its
                       chain of tags from the root down to it.
    path            -- sequence of steps exposing ``go_parent``,
                       ``child_tag`` and ``child_index``.

    Returns the tag at the end of the path, or None when a step cannot be
    taken (no such child, or the path climbs above the root).
    """
    tag = source_tag_info[0]
    tag_stack = source_tag_info[1]
    stack_i = len(tag_stack) - 1

    for step in path:
        if step.go_parent:
            stack_i -= 1
            if stack_i < 0:
                # Above the root; a negative index would silently wrap
                # around to the end of the stack.
                return None
            tag = tag_stack[stack_i]
            continue

        # Pick the child_index-th child (zero-based) named child_tag.
        remaining = step.child_index
        found = None
        for child in tag[I_CHILDREN]:
            if child[I_TAGNAME] == step.child_tag:
                if remaining == 0:
                    found = child
                    break
                remaining -= 1
        if found is None:
            return None
        tag = found

    return tag
|
||||||
|
|
||||||
def get_selection_tag_ids(item_tag_ids, html_json):
    """Find, for every sample item, the ids of all analogous tags.

    item_tag_ids -- mapping of item name to the 'tag-id' of its sample tag.
    html_json    -- root tag entry of the tree.

    One item (whichever ``dict.popitem`` yields) serves as the anchor:
    every tag with the same chain of tag names as the anchor's sample is a
    candidate.  A candidate is kept only if the relative path to each
    other item resolves from it.

    Returns a mapping of item name to a list of tag ids; position ``k`` in
    every list belongs to the same kept candidate.  Empty input gives an
    empty mapping.
    """
    if not item_tag_ids:
        # popitem() below raises KeyError on an empty dict.
        return {}

    # build parent stacks for every item name
    parent_stacks = {}
    for name in item_tag_ids:
        parent_stacks[name] = _build_parent_stack(html_json, item_tag_ids[name])

    # get first item and get its path
    first_name, parent_stack = parent_stacks.popitem()
    parent_tag_names = [tag[I_TAGNAME] for tag in parent_stack]

    # find tags for first item
    tags = _find_tags_by_tag_names(html_json, parent_tag_names)

    # get paths for the other items
    selection_pathes = {}
    # NOTE(review): the loop header sits on the one line elided between the
    # two diff hunks; reconstructed as iterating the remaining stacks --
    # confirm against the real file.
    for name in parent_stacks:
        selection_pathes[name] = _build_path(parent_stack, parent_stacks[name])

    # get selection ids
    selection_ids = {name: [] for name in item_tag_ids}
    for tag_info in tags:
        ids = {}
        for name in selection_pathes:
            tag = _find_tag(html_json, tag_info, selection_pathes[name])
            if tag is None:
                # One item is missing for this candidate: drop it entirely
                # so all lists stay aligned.
                ids = None
                break
            ids[name] = tag[I_ATTRS]['tag-id']
        if ids is not None:
            selection_ids[first_name].append(tag_info[0][I_ATTRS]['tag-id'])
            for name in selection_pathes:
                selection_ids[name].append(ids[name])

    return selection_ids
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user