问题发生自一个用户的实时任务,任务大致逻辑是从源表读取数据,和ob维表关联,并将结果写入mysql表中,但是用户启动任务后发现,任务根本无法消费,整个进程卡住了,jstack查看后发现进程一直处于以下这样的堆栈中

  1. "Bolt-node-11:6" #51 prio=10 os_prio=0 tid=0x00002b1454002000 nid=0x676b in Object.wait() [0x00002b142cd06000]
  2. java.lang.Thread.State: RUNNABLE
  3. at sun.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method)
  4. at sun.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:62)
  5. at sun.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45)
  6. at java.lang.reflect.Constructor.newInstance(Constructor.java:423)
  7. at java.lang.Class.newInstance(Class.java:442)
  8. at java.util.ServiceLoader$LazyIterator.nextService(ServiceLoader.java:380)
  9. at java.util.ServiceLoader$LazyIterator.next(ServiceLoader.java:404)
  10. at java.util.ServiceLoader$1.next(ServiceLoader.java:480)
  11. at java.sql.DriverManager$2.run(DriverManager.java:613)
  12. at java.sql.DriverManager$2.run(DriverManager.java:593)
  13. at java.security.AccessController.doPrivileged(Native Method)
  14. at java.sql.DriverManager.loadInitialDrivers(DriverManager.java:593)
  15. at java.sql.DriverManager.<clinit>(DriverManager.java:110)
  16. at com.mysql.jdbc.Driver.<clinit>(Driver.java:61)
  17. at java.lang.Class.forName0(Native Method)
  18. at java.lang.Class.forName(Class.java:264)
  19. ...
  20. "Spout-node-1:2" #44 prio=10 os_prio=0 tid=0x00002b1434009800 nid=0x6765 in Object.wait() [0x00002b142c700000]
  21. java.lang.Thread.State: RUNNABLE
  22. at com.aitozi.xxx.druid.proxy.DruidDriver.registerDriver(DruidDriver.java:92)
  23. at com.aitozi.xxx.druid.proxy.DruidDriver$1.run(DruidDriver.java:84)
  24. at java.security.AccessController.doPrivileged(Native Method)
  25. at com.aitozi.xxx.druid.proxy.DruidDriver.<clinit>(DruidDriver.java:81)
  26. at com.aitozi.xxx.druid.pool.DruidDataSource.init(DruidDataSource.java:563)
  27. at com.aitozi.xxx.datasource.ObServerDataSourceFactory.createDruidDataSource(ObServerDataSourceFactory.java:113)
  28. at com.aitozi.xxx.datasource.ObServerDataSourceFactory.createDruidHolder(ObServerDataSourceFactory.java:40)
  29. at com.aitozi.xxx.datasource.ObServerDataSourceInventory.createDataSourceHolder(ObServerDataSourceInventory.java:247)
  30. at com.aitozi.xxx.datasource.ObServerDataSourceInventory.getDataSource(ObServerDataSourceInventory.java:183)
  31. at com.aitozi.xxx.datasource.ObServerDataSourceInventory.getDataSource(ObServerDataSourceInventory.java:434)
  32. at com.aitozi.xxx.connection.ObGroupConnection.createNewConnection(ObGroupConnection.java:156)
  33. at com.aitozi.xxx.connection.ObGroupConnection.getServerConnetion(ObGroupConnection.java:214)
  34. at com.aitozi.xxx.connection.ObGroupConnection.getAvailConnection(ObGroupConnection.java:271)
  35. at com.aitozi.xxx.statement.ObStatement.getAvailConnection(ObStatement.java:1087)
  36. at com.aitozi.xxx.statement.ObStatement.doExecute(ObStatement.java:614)
  37. at com.aitozi.xxx.statement.ObStatement.execute(ObStatement.java:514)
  38. at com.aitozi.xxx.statement.ObStatement.execute(ObStatement.java:1224)
  39. at com.aitozi.xxx.statement.ObStatement.executeQuery(ObStatement.java:1179)
  40. at com.aitozi.xxx.resource.ObServerStateProcessor.refreshZoneState(ObServerStateProcessor.java:232)
  41. at com.aitozi.xxx.resource.ObServerStateProcessor.doRefreshTask(ObServerStateProcessor.java:133)
  42. at com.aitozi.xxx.resource.ObClusterResourcePool.createClusterResource(ObClusterResourcePool.java:112)
  43. at com.aitozi.xxx.resource.ObClusterResourcePool.syncCreateClusterResource(ObClusterResourcePool.java:66)
  44. at com.aitozi.xxx.datasource.ObGroupDataSource.createClusterResource(ObGroupDataSource.java:708)
  45. at com.aitozi.xxx.datasource.ObGroupDataSource.init(ObGroupDataSource.java:676)
  46. at com.aitozi.xxx.plugin.dim.AbstractOceanBaseDimStore.createDatasource(AbstractOceanBaseDimStore.java:333)
  47. at com.aitozi.xxx.plugin.dim.AbstractOceanBaseDimStore.lambda$connect$5(AbstractOceanBaseDimStore.java:300)
  48. at com.aitozi.xxx.plugin.dim.AbstractOceanBaseDimStore$$Lambda$64/1581344066.run(Unknown Source)
  49. at java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511)
  50. at com.aitozi.xxx.utils.RetryCommand.run(RetryCommand.java:36)
  51. at com.aitozi.xxx.plugin.dim.AbstractOceanBaseDimStore.connect(AbstractOceanBaseDimStore.java:298)
  52. at com.aitozi.xxx.plugin.dim.AbstractOceanBaseDimStore.open(AbstractOceanBaseDimStore.java:114)
  53. at com.aitozi.xxx.dim.DimStoreManager.getKVDimStore(DimStoreManager.java:42)
  54. - locked <0x00000000ee8e2790> (a java.util.HashMap)
  55. at com.aitozi.xxx.runtime.rc.StreamingRuntimeContext.getKVDimStore(StreamingRuntimeContext.java:131)
  56. at com.aitozi.xxx.sql.runtime.runner.dim.OceanBaseDimTable.dimStore(OceanBaseDimTable.java:183)
  57. at com.aitozi.xxx.sql.runtime.runner.StreamJoinTableRunner$StreamJoinTableFn.open(StreamJoinTableRunner.java:243)
  58. at com.aitozi.xxx.api.function.util.FunctionUtils.openFunction(FunctionUtils.java:33)
  59. at com.aitozi.xxx.operator.base.AbstractOperator.setUp(AbstractOperator.java:66)
  60. at com.aitozi.xxx.operator.stream.AbstractStreamOperator.setUp(AbstractStreamOperator.java:58)
  61. at com.aitozi.xxx.operator.stream.StreamFlatMapOperator.setUp(StreamFlatMapOperator.java:26)
  62. at com.aitozi.xxx.runtime.stream.processor.AbstractSProcessor.operatorInit(AbstractSProcessor.java:115)
  63. at com.aitozi.xxx.runtime.stream.processor.AbstractSProcessor.operatorInit(AbstractSProcessor.java:119)
  64. at com.aitozi.xxx.runtime.stream.processor.AbstractSProcessor.operatorInit(AbstractSProcessor.java:119)
  65. at com.aitozi.xxx.runtime.stream.processor.AbstractSProcessor.open(AbstractSProcessor.java:109)
  66. at com.aitozi.xxx.runtime.stream.engine.storm.KeplerSpout.open(KeplerSpout.java:109)
  67. at com.aitozi.xxx.transactional.spout.ScheduleTransactionSpout.open(ScheduleTransactionSpout.java:47)
  68. at com.aitozi.xxx.task.execute.spout.SpoutExecutors.init(SpoutExecutors.java:108)
  69. at com.aitozi.xxx.task.execute.spout.MultipleThreadSpoutExecutors.init(MultipleThreadSpoutExecutors.java:52)
  70. at com.aitozi.xxx.task.execute.BaseExecutors.initWrapper(BaseExecutors.java:235)
  71. at com.aitozi.xxx.task.execute.BaseExecutors.run(BaseExecutors.java:262)
  72. at com.aitozi.xxx.callback.AsyncLoopRunnable.run(AsyncLoopRunnable.java:94)
  73. at java.lang.Thread.run(Thread.java:882)

这里只截取了两段有问题的堆栈,以上两个线程分别在执行MysqlWriter的初始化和Ob维表的初始化,我们后面简称为Mysql线程和Ob线程,

我们可以看到Mysql线程一直卡在了newInstance,对象创建上,和Ob线程并没有显示的锁竞争,那么为什么会导致进程卡死了呢?下面我们来简单分析下两个线程的执行流程

Mysql线程执行流程

  1. com.aitozi.xxx.plugins.output.mysql.ng.MySQLWriter#doConnect
  1. protected void doConnect() throws Exception {
  2. Class.forName("com.mysql.jdbc.Driver");
  3. connection = DriverManager.getConnection(url, user, password);
  4. ...
  5. }

这里我们可以看到是调用了Class.forName(), 显示的进行了mysql driver的初始化,继续track jdk内部方法,得到以下的调用链

  1. com.mysql.jdbc.Driver

    1. static {
    2. try {
    3. java.sql.DriverManager.registerDriver(new Driver());
    4. } catch (SQLException E) {
    5. throw new RuntimeException("Can't register driver!");
    6. }
    7. }

    mysql Driver的静态块初始化, 调用DriverManager静态方法注册driver,触发DriverManager初始化

  2. java.sql.DriverManager.

    1. /**
    2. * Load the initial JDBC drivers by checking the System property
    3. * jdbc.properties and then use the {@code ServiceLoader} mechanism
    4. */
    5. static {
    6. loadInitialDrivers();
    7. println("JDBC DriverManager initialized");
    8. }

这里是尝试load所有的driver,这里是通过spi的方式(一种java的服务发现机制),简单的说就是在classpath下的META-INF/services/目录里创建一个以服务接口命名的文件这里就是java.sql.Driver,这个文件里的内容就是这个接口的具体的实现类。当程序需要这个服务的时候,就可以通过查找META-INF/services/中的配置文件,配置文件中有接口的具体实现类名,可以根据这个类名进行加载实例化,就可以使用该服务了。线上的java.sql.Driver文件内容如下

  1. $ cat java.sql.Driver
  2. com.alibaba.druid.proxy.DruidDriver
  3. com.alibaba.druid.mock.MockDriver
  4. com.mysql.jdbc.Driver
  5. com.mysql.fabric.jdbc.FabricMySQLDriver
  6. com.alipay.oceanbase.obproxy.druid.proxy.DruidDriver
  7. com.alipay.oceanbase.obproxy.druid.mock.MockDriver
  8. org.apache.calcite.jdbc.Driver
  9. org.apache.calcite.avatica.remote.Driver
  10. oracle.jdbc.OracleDriver

因此这里会去按顺序尝试创建各个Driver实例,到这里Mysql线程block在了DruidDriver的实例创建上

Ob线程执行流程

  1. com.aitozi.xxx.datasource.ObGroupDataSource.init
  1. protected void createDatasource() throws Exception {
  2. if (obDataSource == null) {
  3. obDataSource = new ObGroupDataSource();
  4. ...
  5. obDataSource.setUrl(this.url);
  6. obDataSource.setFullUsername(this.fullUserName);
  7. obDataSource.setPassword(this.password);
  8. obDataSource.setDatabase(this.datebase);
  9. obDataSource.init();
  10. }
  11. }
  1. com.aitozi.xxx.druid.pool.DruidDataSource.init 进行DruidDataSource初始化
  2. com.aitozi.xxx.druid.proxy.DruidDriver. druidDriver的静态块初始化

    1. this.id = DruidDriver.createDataSourceId();
  3. 调用registerDriver开始进行注册,到这里ob线程也卡住了

    1. DriverManager.registerDriver(driver);

到这里可能还不太能看出两个线程block在哪里了,只是隐隐感觉都和Driver初始化有关,也都和DriverManager有关,下面的流程图能够一目了然的看出互相block在什么地方

多线程Driver初始化引起的死锁问题 - 图1 可以看到:
Mysql线程在等待Ob线程完成DruidDriverClass 加载
Ob线程在等待Mysql线程完成DriverManager Class加载

到这里谜底就揭开了,在堆栈信息中并没有显示的lock,是因为两个线程被对方的class初始化时的类锁给block住了

如何解决

其实这个问题发生的主要原因是因为DriverManager在类加载时会通过spi机制去扫描service类进行加载,一种方式是在创建DataSource或者显示加载driver的时候对DriverManager.class类加锁,或者在进程启动前暴力点直接先进行加载

  1. DriverManager.getDrivers();

附录中DBCP的一个issue提到的解法也是这个,我在代码中没有找到合适的位置直接进行加载,故采用了第一种解法去解决

复现case

  1. public class DriverManagerDeadLockTest {
  2. List<String> drivers = new ArrayList<>();
  3. @Before
  4. public void addDrivers() {
  5. drivers.add("com.mysql.jdbc.Driver");
  6. drivers.add("com.aitozi.xxx.druid.proxy.DruidDriver");
  7. }
  8. @Test
  9. public void testDeadLock() throws Exception {
  10. reproduceDeadLock(true, false);
  11. }
  12. @Test
  13. public void testDeadLock3() throws Exception {
  14. reproduceDeadLock(false, true);
  15. }
  16. @Test(expected = TimeoutException.class)
  17. public void testDeadLock2() throws Exception {
  18. reproduceDeadLock(false, false);
  19. }
  20. /**
  21. * 1. 预加载driver的方式解决
  22. * 2. 或者加载时对DriverManager提前load解决
  23. * @param preLoadDrivers
  24. * @throws Exception
  25. */
  26. public void reproduceDeadLock(boolean preLoadDrivers, boolean enableSynchronized) throws Exception {
  27. ExecutorService es = Executors.newFixedThreadPool(2);
  28. List<Future<?>> futures = new ArrayList<>();
  29. if (preLoadDrivers) {
  30. DriverManager.getDrivers();
  31. }
  32. CountDownLatch latch = new CountDownLatch(1);
  33. for (String driver: drivers) {
  34. futures.add(es.submit(() -> {
  35. try {
  36. latch.await();
  37. if (enableSynchronized) {
  38. synchronized (DriverManager.class) {
  39. Class.forName(driver);
  40. }
  41. } else {
  42. Class.forName(driver);
  43. }
  44. System.out.println("finished to create " + driver);
  45. } catch (Exception e) {
  46. System.out.println("failed to create " + driver);
  47. e.printStackTrace();
  48. }
  49. }));
  50. }
  51. latch.countDown();
  52. for (Future<?> f : futures) {
  53. f.get(5, TimeUnit.SECONDS);
  54. }
  55. es.shutdown();
  56. }
  57. }

参考资料

https://medium.com/@priyaaggarwal24/avoiding-deadlock-when-using-multiple-jdbc-drivers-in-an-application-ce0b9464ecdf
https://bugs.java.com/bugdatabase/view_bug.do?bug_id=6354348
https://issues.apache.org/jira/browse/DBCP-272