index可以合成使用
import faiss
import numpy as np

# Synthetic benchmark data: every component of every vector is drawn from a
# normal distribution with mean `mu` and standard deviation `sigma`.
d = 512  # vector dimensionality
n_data = 2000  # number of database vectors
mu = 3
sigma = 0.1

# Database vectors. One np.random.normal call per row, made right after
# seeding, so the generated values are reproducible from run to run.
np.random.seed(0)
data = np.array(
    [np.random.normal(mu, sigma, d) for _ in range(n_data)]
).astype('float32')  # faiss works on float32 arrays

# Query vectors: same distribution, different seed.
n_query = 10
np.random.seed(12)
query = np.array(
    [np.random.normal(mu, sigma, d) for _ in range(n_query)]
).astype('float32')
乘积量化也可以作为粗量化器。
其中有两个参数:
- $c$:每个划分中的聚类中心(倒排表)个数;
- $m$:向量被切分成的划分数。

这样每个划分都有 $c$ 个倒排表,一共有 $c^m$ 个倒排表。实际使用中,一般直接让 $m=2$。
`MultiIndexQuantizer` 也经常与 `IndexFlat` 对比,以便选取合适的参数。
# Use a product (multi-index) quantizer as the coarse quantizer of an IVF index.
nbits_mi = 5  # bits per sub-quantizer (labelled "c" in the accompanying text)
M_mi = 2  # number of sub-quantizers (labelled "m" in the accompanying text)
coarse_quantizer_mi = faiss.MultiIndexQuantizer(d, M_mi, nbits_mi)  # no data needs to be added to it
ncentroids_mi = 2 ** (M_mi * nbits_mi)  # number of inverted lists handed to the IVF index
index = faiss.IndexIVFFlat(coarse_quantizer_mi, d, ncentroids_mi)
# Flag marking the special coarse quantizer.
# NOTE(review): per the faiss docs this makes train() pass the training set
# straight to the quantizer's own train() -- confirm against the installed version.
index.quantizer_trains_alone = True
index.train(data)
index.add(data)
index.nprobe = 50
dis, ind = index.search(query, 10)  # 10 results per query vector
print(dis)
print(ind)
[[8.61838 8.9166355 8.998905 9.01289 9.019705 9.188372 9.192066 9.236515 9.236515 9.238319]
 [9.164592 9.201885 9.344341 9.34485 9.416972 9.513818 9.5142 9.573903 9.605167 9.605826]
 [8.211029 8.373353 8.459253 8.459894 8.498556 8.631897 8.658703 8.697348 8.71368 8.735945]
 [8.45299 8.513636 8.545483 8.597126 8.705633 8.7258835 8.7599125 8.761805 8.781776 8.80736]
 [8.369623 8.760081 8.928711 8.93695 8.960407 9.022133 9.035181 9.041813 9.088304 9.187949]
 [8.299071 8.432397 8.457373 8.562357 8.579033 8.6983185 8.794185 8.794858 8.79899 8.835644]
 [8.860753 8.885756 8.922768 8.928856 8.9446945 8.959139 8.972376 8.977709 9.020763 9.039915]
 [8.763845 8.7686 8.846914 8.846914 8.9460125 8.97376 8.976009 9.007911 9.009394 9.033215]
 [8.488056 8.662771 8.701336 8.741288 8.8490505 8.857254 8.893715 8.933592 8.933592 8.938933]
 [8.684618 8.767246 8.903692 8.903692 8.917681 8.940119 8.961666 9.108561 9.109709 9.123064]]
[[1269 1028  120 1267 1567 1070 1061   28   28 1972]
 [1398  289   13   70 1023 1177  940 1568  700  604]
 [ 345  389 1904 1992 1612 1623 1632  539 1143  366]
 [1412 1624  879  394  835 1506 1398   91  440 1723]
 [1666   94 1517 1723 1255  238 1755  472  375 1719]
 [ 574 1523  766   91  456  154  296  444 1384 1230]
 [1391  876   91 1914   78  969  732  999 1158   42]
 [1662 1831 1654 1654  722 1070  121 1496  631 1442]
 [ 154   31 1237  289  661  426 1008 1727 1727  744]
 [ 375 1826  750  750 1430  459 1339  471 1554  441]]
使用汉明距离比使用乘积量化器要快6倍。然而,通过适当重新排序量化质心,PQ码之间的汉明距离与真正的距离相关。通过在汉明距离上施加阈值,可以避免大多数PQ码比较的时间开销。
# Polysemous codes on a plain IndexPQ.
index = faiss.IndexPQ(d, 16, 8)  # 16 * 8 = 128 bits per code (see the threshold notes in the text)
# before train
index.do_polysemous_training = True
index.train(data)
index.add(data)
# before searching
index.search_type = faiss.IndexPQ.ST_polysemous
index.polysemous_ht = 54  # the Hamming threshold
dis, ind = index.search(query, 10)  # 10 results per query vector
print(dis)
print(ind)
[[5.974182 6.10614 6.1224976 6.126343 6.204773 6.2459717 6.2525024 6.2615356 6.2651367 6.2732544]
 [6.5718384 6.638489 6.6399536 6.6640625 6.6744385 6.6782837 6.746521 6.751709 6.756714 6.769348]
 [5.9680786 5.9692383 5.979309 6.0097046 6.039795 6.1034546 6.1087036 6.1325684 6.1377563 6.140991]
 [5.486389 5.761841 5.781189 5.7894897 5.8067017 5.831421 5.8376465 5.840637 5.8604126 5.875305]
 [5.8931885 6.109314 6.147461 6.1534424 6.185974 6.21875 6.2248535 6.243225 6.2452393 6.2542725]
 [5.776123 6.0252686 6.0493774 6.0758057 6.093445 6.0980225 6.1068115 6.114258 6.139099 6.17218]
 [6.023987 6.0289917 6.043213 6.04834 6.055298 6.091736 6.1139526 6.1154785 6.140625 6.1411133]
 [6.0039062 6.060547 6.0739136 6.130188 6.179138 6.1831665 6.22876 6.249756 6.2651367 6.2764893]
 [5.9506226 6.124695 6.152466 6.159851 6.164917 6.1794434 6.1951904 6.20166 6.215149 6.2680664]
 [6.0117188 6.022705 6.185547 6.2302856 6.256653 6.270447 6.2957764 6.3084717 6.317688 6.328308]]
[[ 704 1201  660 1063 1156  507 1946  933  231  350]
 [ 907 1444 1035 1570 1584 1998   52   23  853  922]
 [1057 1855 1351  725 1001  502 1999  964 1179  533]
 [1412 1528 1383  397 1987 1294  832  112  820 1583]
 [1182 1666   89  529 1116 1723  743  667 1465 1257]
 [ 766  798 1463 1472  436 1981  126 1995  460  681]
 [ 539  123 1797 1950  169  348 1246 1955 1700  341]
 [1397 1529 1573  188  579  954 1715 1617 1601  468]
 [ 929 1438  335  571 1220   56 1116  139 1992   31]
 [ 182  360  355 1359  118 1654  500 1351  191  237]]
对于IndexIVFPQ
# for IndexIVFPQ
coarse_quantizer = faiss.IndexFlatL2(d)
# ncentroids_mi and M_mi are reused from the MultiIndexQuantizer example above.
# NOTE(review): with M_mi sub-quantizers of 8 bits each, the codes here are
# M_mi * 8 = 16 bits long, so a Hamming threshold of 54 can never reject a
# code -- confirm whether the 16 sub-quantizers of the IndexPQ example were intended.
index = faiss.IndexIVFPQ(coarse_quantizer, d, ncentroids_mi, M_mi, 8)
# before training
index.do_polysemous_training = True
index.train(data)
index.add(data)
# before searching
index.polysemous_ht = 54  # the Hamming threshold
dis, ind = index.search(query, 10)  # 10 results per query vector
print(dis)
print(ind)
[[5.5944242 5.762803 5.81783 5.826597 5.8281116 5.853023 5.8883333 5.9103804 5.9335566 5.9981723]
 [6.017277 6.0180216 6.1391654 6.1525793 6.1690145 6.285906 6.367799 6.3924723 6.483139 6.5108876]
 [5.1201916 5.155489 5.1658216 5.2309318 5.2639446 5.27475 5.284258 5.3065944 5.332886 5.40935]
 [5.0789447 5.1293364 5.129437 5.2446656 5.2639885 5.297005 5.3017607 5.332982 5.33683 5.3529315]
 [5.5589275 5.579212 5.659722 5.6908636 5.7286787 5.7511234 5.7699876 5.7724476 5.7734365 5.8821025]
 [5.2813945 5.3259473 5.353539 5.38878 5.3981133 5.4004183 5.4237156 5.4381495 5.4439116 5.4494076]
 [5.431495 5.4494677 5.548911 5.555647 5.6319094 5.634611 5.638289 5.6782656 5.7148957 5.720402]
 [5.3285294 5.335212 5.4976454 5.5008645 5.525567 5.5554523 5.555627 5.5836873 5.5910864 5.593465]
 [5.4126625 5.4127774 5.491379 5.5269804 5.5783095 5.578781 5.619939 5.622184 5.637552 5.666792]
 [5.3548384 5.5728064 5.635675 5.6637754 5.703406 5.703953 5.729717 5.740739 5.743409 5.77369]]
[[1483  695 1008 1115  993  869  236   31 1460  231]
 [1166 1706 1665  882 1691 1596 1308  127 1646  784]
 [ 655  224  276 1984  389 1793  324  707 1889 1363]
 [ 973  200  112  902  178  806  283 1006  798 1348]
 [ 561  280  653 1220  768  862  421  122  161 1626]
 [1934  112 1006 1345  559  252 1337  786 1348  277]
 [ 362 1016 1766  506  851  592 1601  311 1384   56]
 [   0  967  972  940 1129   60  932 1036  134 1566]
 [1309  325  188 1685  534 1313  349 1831 1742  388]
 [1345 1934 1348  283  559 1006  973  635 1403  200]]
要设置合理的阈值,请记住:
- 阈值(threshold)应介于0和每个代码的位数之间(在这种情况下为 $128 = 16 \times 8$),并且代码遵循二项式分布
- 将阈值设置为每个代码位数的1/2将节省代码比较的1/2,这还不够。应将其设置为较低的值(因此对于128位代码的结果为54)。
`IndexIVFPQR` 在 `IndexIVFPQ` 之上又增加了一层额外的量化。与 `IndexRefineFlat` 类似,它会修正 `IndexIVFPQ` 计算出的距离,并基于修正后的距离对结果重新排序。
本文发布于:2024-01-29 04:12:24,感谢您对本站的认可!
本文链接:https://www.4u4v.net/it/170647274912608.html
版权声明:本站内容均来自互联网,仅供演示用,请勿用于商业和其他非法用途。如果侵犯了您的权益请与我们联系,我们将在24小时内删除。
留言与评论(共有 0 条评论) |