postgresql xid回卷预防及排查

发布时间 2023-09-28 17:10:44作者: 章怀柔

监控

-- Cluster-wide XID wraparound monitor.
-- Returns one row: the largest datfrozenxid age among connectable databases,
-- that age as a percentage of a 2,000,000,000 alert ceiling, and as a
-- percentage of the autovacuum_freeze_max_age setting.
WITH max_age AS (
    -- Alert ceiling (2 billion) and the configured autovacuum_freeze_max_age.
    SELECT 2000000000 AS max_old_xid
        , setting AS autovacuum_freeze_max_age
    FROM pg_catalog.pg_settings
    WHERE name = 'autovacuum_freeze_max_age'
)
, per_database_stats AS (
    -- One row per database that allows connections, with its datfrozenxid age.
    SELECT datname
        , m.max_old_xid::int
        , m.autovacuum_freeze_max_age::int
        , age(d.datfrozenxid) AS oldest_current_xid
    FROM pg_catalog.pg_database d
    JOIN max_age m ON (true)
    WHERE d.datallowconn
)
SELECT max(oldest_current_xid) AS oldest_current_xid
    , max(ROUND(100*(oldest_current_xid/max_old_xid::float))) AS percent_towards_wraparound
    , max(ROUND(100*(oldest_current_xid/autovacuum_freeze_max_age::float))) AS percent_towards_emergency_autovac
-- The FROM clause was missing; without it the columns above are unresolved.
FROM per_database_stats;

  

percent_towards_wraparound指标是最需要设置警报的指标。该查询使用 age() 函数计算 TXID 的年龄,因此结果反映的是距离 XID 真正耗尽(即发生回卷)还有多远。一旦耗尽,数据库将被迫关闭,而修复所需的停机时间难以预估。这个查询留有一点缓冲,因为它检查的上限(确切地说是20亿)小于真正导致耗尽的最大整数值。但二者已经足够接近,一旦达到100%就应该立即采取行动。

percent_towards_emergency_autovac指标是我们建议监测的附加值,特别是对于以前从未监测过此指标的系统(有关何时可以降低或删除此警报优先级,请参阅下面有关近期冻结成效的说明)。它监视数据库中最老的 TXID 年龄是否达到autovacuum_freeze_max_age。这是一个用户可调的参数,默认值为2亿;当任何表的 TXID 年龄达到该值时,该表上会触发一次更高优先级的autovacuum。您可以识别出这种特殊的vacuum会话,因为它在pg_stat_activity中带有"(to prevent wraparound)"(防止回卷)标记。它的优先级更高:即使禁用了autovacuum,它也会运行;如果手动取消该vacuum,它几乎会立即重新启动。它还需要一些不同的内部低级锁,因此可能会使这些表上的争用略有增加,具体取决于紧急vacuum期间这些表的使用方式。如果您确实遇到了由紧急vacuum造成的争用/锁问题,可以放心地取消该vacuum,让其他事务先完成。请注意,它会不断重新启动,直到该vacuum成功完成,或者您手动运行vacuum并成功完成为止。

 

 

排查

# 查看每个库的年龄

-- Age of datfrozenxid for every database in the cluster.
SELECT
    datname,
    age(datfrozenxid)
FROM pg_database;

-- The 100 relations with the oldest relfrozenxid, each with its total size.
-- Relations in the pg_toast schema are filtered out.
SELECT
    c.oid::regclass,
    age(c.relfrozenxid),
    pg_size_pretty(pg_total_relation_size(c.oid))
FROM pg_class AS c
INNER JOIN pg_namespace AS n
    ON n.oid = c.relnamespace
WHERE c.relkind IN ('r', 't', 'm')
    AND n.nspname <> 'pg_toast'
ORDER BY age(c.relfrozenxid) DESC
LIMIT 100;
# 1个库每个表的年龄排序
-- Every table and materialized view, oldest first. The reported age is the
-- larger of the relation's own relfrozenxid age and that of its TOAST table.
SELECT c.oid::regclass AS table_name
    , greatest(age(c.relfrozenxid), age(t.relfrozenxid)) AS age
FROM pg_class c
LEFT JOIN pg_class t ON c.reltoastrelid = t.oid
WHERE c.relkind IN ('r', 'm')
ORDER BY age DESC;

# 查看1个表的年龄
-- relfrozenxid age of one table; replace the quoted placeholder with schema.table.
SELECT
    oid::regclass,
    age(relfrozenxid)
FROM pg_class
WHERE oid = 'schema名称.表名称'::regclass::oid;
#这查询按照最老的XID排序,查看大于1G而且是排名前20的表:

-- Top 20 ordinary tables larger than 1 GiB, ordered by relfrozenxid age (oldest first).
SELECT relname
    , age(relfrozenxid) AS xid_age
    , pg_size_pretty(pg_table_size(oid)) AS table_size
FROM pg_class
WHERE relkind = 'r'
    AND pg_table_size(oid) > 1073741824  -- 1 GiB expressed in bytes
ORDER BY xid_age DESC
LIMIT 20;
-- Transaction age before vacuum was 61436. Sample output:
--     relname     | xid_age | table_size
-- ----------------+---------+------------
--  test_tab       |   31260 | 4327 MB

 



#通过以下语句可以查找出age年龄大于vacuum_freeze_table_age的库(查询的是pg_database,不含postgres、template0、template1):
 select datname,age(datfrozenxid) from pg_database where datname not in ('postgres','template0','template1') and age(datfrozenxid)>(select setting::int from pg_settings where name='vacuum_freeze_table_age')order by age(datfrozenxid) desc;

  

运维脚本

# Run VACUUM FREEZE on the 50 oldest tables of the given database (age = the
# larger of the table's own relfrozenxid age and its TOAST table's).
# Replace 用户名 / 端口号 / 连接串 / 数据库名 with real connection settings.
#
# -At prints bare rows (no header, no row-count footer), so no grep filtering
# is needed. Reading line by line keeps each "vacuum freeze <table>;" command
# intact instead of splitting it on whitespace.
# The query takes the first 50 rows; the earlier "offset 50" skipped exactly
# the oldest tables this script is meant to freeze.
psql -U用户名 -p端口号 -h连接串 -d数据库名 -At -c "SELECT 'vacuum freeze '||c.oid::regclass||';' as vacuum_cmd FROM pg_class c LEFT JOIN pg_class t ON c.reltoastrelid = t.oid WHERE c.relkind IN ('r', 'm') order by greatest(age(c.relfrozenxid),age(t.relfrozenxid)) desc limit 50;" |
while IFS= read -r cmd; do
    # stdin is redirected so the inner psql cannot consume the remaining commands.
    psql -U用户名 -p端口号 -h连接串 -d数据库名 -c "$cmd" < /dev/null
done

  python脚本

from multiprocessing import Pool

import psycopg2



# Keyword arguments passed to psycopg2.connect(); replace the placeholders
# with the real host, database, user and password.
args = {
    'host': 'pgm-bp10xxxx.pg.rds.aliyuncs.com',
    'port': 5432,
    'dbname': '数据库名',
    'user': '用户名',
    'password': '密码',
}



def vacuum_handler(sql):
    """Execute one SQL command, then print the 10 oldest tables.

    Args:
        sql: the statement to run on a fresh connection, e.g. a
            "vacuum freeze <table>;" command produced by multi_vacuum().

    Returns:
        None. Any exception is caught and its message printed, so one failing
        table does not abort the other workers.
    """
    sql_str = "SELECT c.oid::regclass as table_name, greatest(age(c.relfrozenxid),age(t.relfrozenxid)) as age FROM pg_class c LEFT JOIN pg_class t ON c.reltoastrelid = t.oid WHERE c.relkind IN ('r', 'm') order by age desc limit 10; "

    conn = None
    try:
        conn = psycopg2.connect(**args)
        # VACUUM cannot run inside a transaction block, and psycopg2 opens one
        # implicitly unless autocommit is enabled.
        conn.autocommit = True

        cur = conn.cursor()
        cur.execute(sql)

        # Report the current oldest tables so progress is visible in the output.
        cur.execute(sql_str)
        print(cur.fetchall())
    except Exception as e:
        print(str(e))
    finally:
        # Close the connection on the error path too.
        if conn is not None:
            conn.close()



def multi_vacuum():
    """VACUUM FREEZE the 1000 oldest tables using 32 worker processes.

    Builds one "vacuum freeze <table>;" command per table (ordered by the
    larger of the table's own and its TOAST table's relfrozenxid age) and
    hands each command to vacuum_handler() through a process pool.

    Returns:
        None. Any exception is caught and its message printed.
    """
    sql_str = "SELECT 'vacuum freeze '||c.oid::regclass||';' as vacuum_cmd FROM pg_class c LEFT JOIN pg_class t ON c.reltoastrelid = t.oid WHERE c.relkind IN ('r', 'm') order by greatest(age(c.relfrozenxid),age(t.relfrozenxid)) desc limit 1000;"

    pool = Pool(processes=32)
    conn = None
    try:
        conn = psycopg2.connect(**args)
        cur = conn.cursor()
        cur.execute(sql_str)

        for row in cur.fetchall():
            # The default psycopg2 cursor returns tuples, so the command is
            # read by position; row['vacuum_cmd'] raises TypeError.
            pool.apply_async(vacuum_handler, (row[0],))
    except Exception as e:
        print(str(e))
    finally:
        # Release the listing connection before waiting on the workers, and
        # always shut the pool down, including on the error path.
        if conn is not None:
            conn.close()
        pool.close()
        pool.join()





# Run only when executed as a script. multiprocessing can re-import this module
# in worker processes (spawn start method), and an unguarded call would start
# the whole job again in every worker.
if __name__ == '__main__':
    multi_vacuum()