title: 多叉树treelib&区间树intervaltree
subtitle: 多叉树相关以及区间树三方库介绍
date: 2021-03-04
author: NSX
catalog: true
tags:
- 多叉树treelib
- 区间树intervaltree
- treelib 库是一个 Python 的第三方库。这个库实现了一些多叉树相关的常用方法。
- intervaltree库是一个可变的、自平衡的区间树。查询可以按点、按范围重叠或按范围包含。
treelib 库
用于构建多叉树
#创建一棵多叉树#show(): 将多叉树按树形结构展示输出from treelib import Tree, Nodetree = Tree()tree.show()print(tree.identifier)#添加节点到多叉树中tree.create_node(tag='Node-5', identifier='node-5', data=5)tree.create_node(tag='Node-10', identifier='node-10', parent='node-5', data=10)tree.create_node('Node-15', 'node-15', 'node-10', 15)tree.show()node = Node(tag='Node-A', identifier='node-A', data='A')tree.add_node(node, parent='node-5')tree.show()"""Node-5└── Node-10└── Node-15Node-5├── Node-10│ └── Node-15├── Node-A"""#节点的属性和方法print(node)print('node id: ', node.identifier)print('node tag:', node.tag)print('node data:', node.data)print('node is leaf: ', node.is_leaf())print('node is root: ', node.is_root())#多叉树中的节点个数print('tree len: ', len(tree))print('tree size:', tree.size())#多叉树的深度和叶子节点print('tree depth:', tree.depth())print('node-20 depth:', tree.depth(node='node-20'))print('node-20 level:', tree.level('node-20'))print('tree leaves:', tree.leaves()) #返回多叉树的所有叶节点print(tree.paths_to_leaves()) #返回根节点到每个叶节点的路径上的所有节点id#返回多叉树中的节点print('tree nodes:', tree.nodes)print(tree.all_nodes())for node in tree.all_nodes_itr():print(node)#多叉树转换成字典和保存到文件中print(tree.to_dict())print(tree.to_json())tree.to_graphviz()tree.save2file('demo_tree.tree')
应用:通过多叉树解决嵌套实体的问题
class Nodex(object):def __init__(self, tmp):self.tmp = tmpdef cross_judge(item_a, item_b):"""判断实体之间是否存在交集"""a1, b1 = item_a["char_offset"], item_a["char_offset"] + item_a["char_length"]a2, b2 = item_b["char_offset"], item_b["char_offset"] + item_b["char_length"]if max(a1, a2) < min(b1, b2):return Trueelse:return Falsedef get_slot_dic_from_tree(result_tmp):"""通过多叉树输出嵌套实体(非包含关系)的排列组合结果result_tmp: 实体相关信息 ↑"""tree1 = Tree()tree1.create_node("Root", "root", data=Nodex(-1)) # 根节点root2leave = []if result_tmp:result_tmp.sort(key=lambda x: x["char_offset"])# 第一个new_node直接加入treetree1.create_node("Child0", "child0", parent="root", data=Nodex(result_tmp[0]))for i in range(1, len(result_tmp)):parent_idt_list = []# 每个new_node与所有的叶子结点进行比较for j, leaf in enumerate(tree1.leaves()):idt = leaf.identifier # 叶子结点标识符parent_idt = tree1.parent(idt).identifier # 父结点标识符node_a = Node(tag="Child" + str(i) + "_" + str(j),identifier="Child" + str(i) + "_" + str(j),data=Nodex(result_tmp[i]),)# 1、如果存在嵌套(冲突),则父节点增加新分支if cross_judge(result_tmp[i], leaf.data.tmp):# 针对相同的new_node,父节点仅可增加一个新分支if parent_idt not in parent_idt_list:tree1.add_node(node_a, parent=parent_idt)parent_idt_list.append(parent_idt)# 2、否则,当前节点加深else:tree1.add_node(node_a, parent=idt)# 最后返回【根节点→叶子结点】的所有路径,即所有不存在嵌套实体的排列组合结果for path in tree1.paths_to_leaves():item = [tree1.get_node(idt).data.tmp for idt in path[1:]]root2leave.append(item)return root2leave
图示解决过程:
intervaltree库
判断多个区间是否有重叠:利用intervaltree构建区间树,也即构建一个二叉排序树,树的叶节点是每个区间。同时,树的查询效率和树的深度是有关系的,所以构建一个相对平衡的二叉排序树也能进一步提高效率,比如红黑树。算法导论上介绍了这种数据结构,节点是区间的红黑树——区间树。
>>> from intervaltree import Interval, IntervalTree>>> tree = IntervalTree()>>> iv = Interval(4, 7)>>> tree.add(iv)>>> target_iv = Interval(5, 8)>>> tree.overlaps(target_iv)True>>> target_iv_2 = Interval(7, 10)>>> tree.overlaps(target_iv_2)False
应用:如“今天8点到10点, 8个人”提取时间和人数时,idx=text.find(x)只能查找第一个时间“8”出现位置的索引,导致人数“8”的位置被覆盖了。解决:在发现实体区间有重叠时,循环执行 idx=text.find(x, idx+1),直到-1跳出;
# 寻找最适合的索引interval_tree = IntervalTree() # 用于判断实体区间的重叠情况!char_offset = text.find(x.upper()) # firstwhile char_offset!=-1:target_iv = Interval(char_offset, char_offset+len(x))is_overlaps = bool(interval_tree.overlaps(target_iv))if is_overlaps: # 有重叠,则继续找,直到-1跳出char_offset = text.find(x.upper(), char_offset+1)else: # 无重叠,直接跳出interval_tree.add(target_iv)breakif char_offset==-1:char_offset = text.find(x.upper()) # first
