问题发生自一个用户的实时任务,任务大致逻辑是从源表读取数据,和ob维表关联,并将结果写入mysql表中,但是用户启动任务后发现,任务根本无法消费,整个进程卡住了,jstack查看后发现进程一直处于以下这样的堆栈中
"Bolt-node-11:6" #51 prio=10 os_prio=0 tid=0x00002b1454002000 nid=0x676b in Object.wait() [0x00002b142cd06000]
java.lang.Thread.State: RUNNABLE
at sun.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method)
at sun.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:62)
at sun.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45)
at java.lang.reflect.Constructor.newInstance(Constructor.java:423)
at java.lang.Class.newInstance(Class.java:442)
at java.util.ServiceLoader$LazyIterator.nextService(ServiceLoader.java:380)
at java.util.ServiceLoader$LazyIterator.next(ServiceLoader.java:404)
at java.util.ServiceLoader$1.next(ServiceLoader.java:480)
at java.sql.DriverManager$2.run(DriverManager.java:613)
at java.sql.DriverManager$2.run(DriverManager.java:593)
at java.security.AccessController.doPrivileged(Native Method)
at java.sql.DriverManager.loadInitialDrivers(DriverManager.java:593)
at java.sql.DriverManager.<clinit>(DriverManager.java:110)
at com.mysql.jdbc.Driver.<clinit>(Driver.java:61)
at java.lang.Class.forName0(Native Method)
at java.lang.Class.forName(Class.java:264)
...
"Spout-node-1:2" #44 prio=10 os_prio=0 tid=0x00002b1434009800 nid=0x6765 in Object.wait() [0x00002b142c700000]
java.lang.Thread.State: RUNNABLE
at com.aitozi.xxx.druid.proxy.DruidDriver.registerDriver(DruidDriver.java:92)
at com.aitozi.xxx.druid.proxy.DruidDriver$1.run(DruidDriver.java:84)
at java.security.AccessController.doPrivileged(Native Method)
at com.aitozi.xxx.druid.proxy.DruidDriver.<clinit>(DruidDriver.java:81)
at com.aitozi.xxx.druid.pool.DruidDataSource.init(DruidDataSource.java:563)
at com.aitozi.xxx.datasource.ObServerDataSourceFactory.createDruidDataSource(ObServerDataSourceFactory.java:113)
at com.aitozi.xxx.datasource.ObServerDataSourceFactory.createDruidHolder(ObServerDataSourceFactory.java:40)
at com.aitozi.xxx.datasource.ObServerDataSourceInventory.createDataSourceHolder(ObServerDataSourceInventory.java:247)
at com.aitozi.xxx.datasource.ObServerDataSourceInventory.getDataSource(ObServerDataSourceInventory.java:183)
at com.aitozi.xxx.datasource.ObServerDataSourceInventory.getDataSource(ObServerDataSourceInventory.java:434)
at com.aitozi.xxx.connection.ObGroupConnection.createNewConnection(ObGroupConnection.java:156)
at com.aitozi.xxx.connection.ObGroupConnection.getServerConnetion(ObGroupConnection.java:214)
at com.aitozi.xxx.connection.ObGroupConnection.getAvailConnection(ObGroupConnection.java:271)
at com.aitozi.xxx.statement.ObStatement.getAvailConnection(ObStatement.java:1087)
at com.aitozi.xxx.statement.ObStatement.doExecute(ObStatement.java:614)
at com.aitozi.xxx.statement.ObStatement.execute(ObStatement.java:514)
at com.aitozi.xxx.statement.ObStatement.execute(ObStatement.java:1224)
at com.aitozi.xxx.statement.ObStatement.executeQuery(ObStatement.java:1179)
at com.aitozi.xxx.resource.ObServerStateProcessor.refreshZoneState(ObServerStateProcessor.java:232)
at com.aitozi.xxx.resource.ObServerStateProcessor.doRefreshTask(ObServerStateProcessor.java:133)
at com.aitozi.xxx.resource.ObClusterResourcePool.createClusterResource(ObClusterResourcePool.java:112)
at com.aitozi.xxx.resource.ObClusterResourcePool.syncCreateClusterResource(ObClusterResourcePool.java:66)
at com.aitozi.xxx.datasource.ObGroupDataSource.createClusterResource(ObGroupDataSource.java:708)
at com.aitozi.xxx.datasource.ObGroupDataSource.init(ObGroupDataSource.java:676)
at com.aitozi.xxx.plugin.dim.AbstractOceanBaseDimStore.createDatasource(AbstractOceanBaseDimStore.java:333)
at com.aitozi.xxx.plugin.dim.AbstractOceanBaseDimStore.lambda$connect$5(AbstractOceanBaseDimStore.java:300)
at com.aitozi.xxx.plugin.dim.AbstractOceanBaseDimStore$$Lambda$64/1581344066.run(Unknown Source)
at java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511)
at com.aitozi.xxx.utils.RetryCommand.run(RetryCommand.java:36)
at com.aitozi.xxx.plugin.dim.AbstractOceanBaseDimStore.connect(AbstractOceanBaseDimStore.java:298)
at com.aitozi.xxx.plugin.dim.AbstractOceanBaseDimStore.open(AbstractOceanBaseDimStore.java:114)
at com.aitozi.xxx.dim.DimStoreManager.getKVDimStore(DimStoreManager.java:42)
- locked <0x00000000ee8e2790> (a java.util.HashMap)
at com.aitozi.xxx.runtime.rc.StreamingRuntimeContext.getKVDimStore(StreamingRuntimeContext.java:131)
at com.aitozi.xxx.sql.runtime.runner.dim.OceanBaseDimTable.dimStore(OceanBaseDimTable.java:183)
at com.aitozi.xxx.sql.runtime.runner.StreamJoinTableRunner$StreamJoinTableFn.open(StreamJoinTableRunner.java:243)
at com.aitozi.xxx.api.function.util.FunctionUtils.openFunction(FunctionUtils.java:33)
at com.aitozi.xxx.operator.base.AbstractOperator.setUp(AbstractOperator.java:66)
at com.aitozi.xxx.operator.stream.AbstractStreamOperator.setUp(AbstractStreamOperator.java:58)
at com.aitozi.xxx.operator.stream.StreamFlatMapOperator.setUp(StreamFlatMapOperator.java:26)
at com.aitozi.xxx.runtime.stream.processor.AbstractSProcessor.operatorInit(AbstractSProcessor.java:115)
at com.aitozi.xxx.runtime.stream.processor.AbstractSProcessor.operatorInit(AbstractSProcessor.java:119)
at com.aitozi.xxx.runtime.stream.processor.AbstractSProcessor.operatorInit(AbstractSProcessor.java:119)
at com.aitozi.xxx.runtime.stream.processor.AbstractSProcessor.open(AbstractSProcessor.java:109)
at com.aitozi.xxx.runtime.stream.engine.storm.KeplerSpout.open(KeplerSpout.java:109)
at com.aitozi.xxx.transactional.spout.ScheduleTransactionSpout.open(ScheduleTransactionSpout.java:47)
at com.aitozi.xxx.task.execute.spout.SpoutExecutors.init(SpoutExecutors.java:108)
at com.aitozi.xxx.task.execute.spout.MultipleThreadSpoutExecutors.init(MultipleThreadSpoutExecutors.java:52)
at com.aitozi.xxx.task.execute.BaseExecutors.initWrapper(BaseExecutors.java:235)
at com.aitozi.xxx.task.execute.BaseExecutors.run(BaseExecutors.java:262)
at com.aitozi.xxx.callback.AsyncLoopRunnable.run(AsyncLoopRunnable.java:94)
at java.lang.Thread.run(Thread.java:882)
这里只截取了两段有问题的堆栈,以上两个线程分别在执行MysqlWriter的初始化和Ob维表的初始化,我们后面简称为Mysql线程和Ob线程,
我们可以看到Mysql线程一直卡在了newInstance,对象创建上,和Ob线程并没有显示的锁竞争,那么为什么会导致进程卡死了呢?下面我们来简单分析下两个线程的执行流程
Mysql线程执行流程
- com.aitozi.xxx.plugins.output.mysql.ng.MySQLWriter#doConnect
protected void doConnect() throws Exception {
Class.forName("com.mysql.jdbc.Driver");
connection = DriverManager.getConnection(url, user, password);
...
}
这里我们可以看到是调用了Class.forName(), 显示的进行了mysql driver的初始化,继续track jdk内部方法,得到以下的调用链
com.mysql.jdbc.Driver
static {
try {
java.sql.DriverManager.registerDriver(new Driver());
} catch (SQLException E) {
throw new RuntimeException("Can't register driver!");
}
}
mysql Driver的静态块初始化, 调用DriverManager静态方法注册driver,触发DriverManager初始化
java.sql.DriverManager.
/**
* Load the initial JDBC drivers by checking the System property
* jdbc.properties and then use the {@code ServiceLoader} mechanism
*/
static {
loadInitialDrivers();
println("JDBC DriverManager initialized");
}
这里是尝试load所有的driver,这里是通过spi的方式(一种java的服务发现机制),简单的说就是在classpath下的META-INF/services/目录里创建一个以服务接口命名的文件这里就是java.sql.Driver,这个文件里的内容就是这个接口的具体的实现类。当程序需要这个服务的时候,就可以通过查找META-INF/services/中的配置文件,配置文件中有接口的具体实现类名,可以根据这个类名进行加载实例化,就可以使用该服务了。线上的java.sql.Driver文件内容如下
$ cat java.sql.Driver
com.alibaba.druid.proxy.DruidDriver
com.alibaba.druid.mock.MockDriver
com.mysql.jdbc.Driver
com.mysql.fabric.jdbc.FabricMySQLDriver
com.alipay.oceanbase.obproxy.druid.proxy.DruidDriver
com.alipay.oceanbase.obproxy.druid.mock.MockDriver
org.apache.calcite.jdbc.Driver
org.apache.calcite.avatica.remote.Driver
oracle.jdbc.OracleDriver
因此这里会去按顺序尝试创建各个Driver实例,到这里Mysql线程block在了DruidDriver的实例创建上
Ob线程执行流程
- com.aitozi.xxx.datasource.ObGroupDataSource.init
protected void createDatasource() throws Exception {
if (obDataSource == null) {
obDataSource = new ObGroupDataSource();
...
obDataSource.setUrl(this.url);
obDataSource.setFullUsername(this.fullUserName);
obDataSource.setPassword(this.password);
obDataSource.setDatabase(this.datebase);
obDataSource.init();
}
}
- com.aitozi.xxx.druid.pool.DruidDataSource.init 进行DruidDataSource初始化
com.aitozi.xxx.druid.proxy.DruidDriver.
druidDriver的静态块初始化 this.id = DruidDriver.createDataSourceId();
调用registerDriver开始进行注册,到这里ob线程也卡住了
DriverManager.registerDriver(driver);
到这里可能还不太能看出两个线程block在哪里了,只是隐隐感觉都和Driver初始化有关,也都和DriverManager有关,下面的流程图能够一目了然的看出互相block在什么地方
可以看到:
Mysql线程在等待Ob线程完成DruidDriverClass 加载
Ob线程在等待Mysql线程完成DriverManager Class加载
到这里谜底就揭开了,在堆栈信息中并没有显示的lock,是因为两个线程被对方的class初始化时的类锁给block住了
如何解决
其实这个问题发生的主要原因是因为DriverManager在类加载时会通过spi机制去扫描service类进行加载,一种方式是在创建DataSource或者显示加载driver的时候对DriverManager.class类加锁,或者在进程启动前暴力点直接先进行加载
DriverManager.getDrivers();
附录中DBCP的一个issue提到的解法也是这个,我在代码中没有找到合适的位置直接进行加载,故采用了第一种解法去解决
复现case
public class DriverManagerDeadLockTest {
List<String> drivers = new ArrayList<>();
@Before
public void addDrivers() {
drivers.add("com.mysql.jdbc.Driver");
drivers.add("com.aitozi.xxx.druid.proxy.DruidDriver");
}
@Test
public void testDeadLock() throws Exception {
reproduceDeadLock(true, false);
}
@Test
public void testDeadLock3() throws Exception {
reproduceDeadLock(false, true);
}
@Test(expected = TimeoutException.class)
public void testDeadLock2() throws Exception {
reproduceDeadLock(false, false);
}
/**
* 1. 预加载driver的方式解决
* 2. 或者加载时对DriverManager提前load解决
* @param preLoadDrivers
* @throws Exception
*/
public void reproduceDeadLock(boolean preLoadDrivers, boolean enableSynchronized) throws Exception {
ExecutorService es = Executors.newFixedThreadPool(2);
List<Future<?>> futures = new ArrayList<>();
if (preLoadDrivers) {
DriverManager.getDrivers();
}
CountDownLatch latch = new CountDownLatch(1);
for (String driver: drivers) {
futures.add(es.submit(() -> {
try {
latch.await();
if (enableSynchronized) {
synchronized (DriverManager.class) {
Class.forName(driver);
}
} else {
Class.forName(driver);
}
System.out.println("finished to create " + driver);
} catch (Exception e) {
System.out.println("failed to create " + driver);
e.printStackTrace();
}
}));
}
latch.countDown();
for (Future<?> f : futures) {
f.get(5, TimeUnit.SECONDS);
}
es.shutdown();
}
}
参考资料
https://medium.com/@priyaaggarwal24/avoiding-deadlock-when-using-multiple-jdbc-drivers-in-an-application-ce0b9464ecdf
https://bugs.java.com/bugdatabase/view_bug.do?bug_id=6354348
https://issues.apache.org/jira/browse/DBCP-272