diff --git a/documents/academic/paper/urban-economics-transport-productivity-p03.jpg b/documents/academic/paper/urban-economics-transport-productivity-p03.jpg new file mode 100644 index 0000000..134f93a Binary files /dev/null and b/documents/academic/paper/urban-economics-transport-productivity-p03.jpg differ diff --git a/documents/academic/paper/urban-economics-transport-productivity-p04.jpg b/documents/academic/paper/urban-economics-transport-productivity-p04.jpg new file mode 100644 index 0000000..c272e88 Binary files /dev/null and b/documents/academic/paper/urban-economics-transport-productivity-p04.jpg differ diff --git a/documents/academic/paper/urban-economics-transport-productivity-p07.jpg b/documents/academic/paper/urban-economics-transport-productivity-p07.jpg new file mode 100644 index 0000000..255953e Binary files /dev/null and b/documents/academic/paper/urban-economics-transport-productivity-p07.jpg differ diff --git a/documents/academic/paper/urban-economics-transport-productivity-p08.jpg b/documents/academic/paper/urban-economics-transport-productivity-p08.jpg new file mode 100644 index 0000000..f394ddb Binary files /dev/null and b/documents/academic/paper/urban-economics-transport-productivity-p08.jpg differ diff --git a/documents/academic/paper/urban-economics-transport-productivity-p10.jpg b/documents/academic/paper/urban-economics-transport-productivity-p10.jpg new file mode 100644 index 0000000..f6ca889 Binary files /dev/null and b/documents/academic/paper/urban-economics-transport-productivity-p10.jpg differ diff --git a/documents/academic/paper/urban-economics-transport-productivity-p12.jpg b/documents/academic/paper/urban-economics-transport-productivity-p12.jpg new file mode 100644 index 0000000..53b17ac Binary files /dev/null and b/documents/academic/paper/urban-economics-transport-productivity-p12.jpg differ diff --git a/documents/academic/paper/urban-economics-transport-productivity-p14.jpg 
b/documents/academic/paper/urban-economics-transport-productivity-p14.jpg new file mode 100644 index 0000000..7fc505c Binary files /dev/null and b/documents/academic/paper/urban-economics-transport-productivity-p14.jpg differ diff --git a/documents/academic/paper/urban-economics-transport-productivity-p15.jpg b/documents/academic/paper/urban-economics-transport-productivity-p15.jpg new file mode 100644 index 0000000..6fb014a Binary files /dev/null and b/documents/academic/paper/urban-economics-transport-productivity-p15.jpg differ diff --git a/documents/academic/paper/urban-economics-transport-productivity-p17.jpg b/documents/academic/paper/urban-economics-transport-productivity-p17.jpg new file mode 100644 index 0000000..dd62957 Binary files /dev/null and b/documents/academic/paper/urban-economics-transport-productivity-p17.jpg differ diff --git a/documents/academic/paper/urban-economics-transport-productivity-p19.jpg b/documents/academic/paper/urban-economics-transport-productivity-p19.jpg new file mode 100644 index 0000000..2a0f8c5 Binary files /dev/null and b/documents/academic/paper/urban-economics-transport-productivity-p19.jpg differ diff --git a/documents/academic/paper/urban-economics-transport-productivity-p25.jpg b/documents/academic/paper/urban-economics-transport-productivity-p25.jpg new file mode 100644 index 0000000..ad3c1cc Binary files /dev/null and b/documents/academic/paper/urban-economics-transport-productivity-p25.jpg differ diff --git a/documents/academic/paper/urban-economics-transport-productivity-p26.jpg b/documents/academic/paper/urban-economics-transport-productivity-p26.jpg new file mode 100644 index 0000000..ed7c914 Binary files /dev/null and b/documents/academic/paper/urban-economics-transport-productivity-p26.jpg differ diff --git a/documents/academic/paper/urban-economics-transport-productivity-p29.jpg b/documents/academic/paper/urban-economics-transport-productivity-p29.jpg new file mode 100644 index 0000000..7f9db92 Binary files 
/dev/null and b/documents/academic/paper/urban-economics-transport-productivity-p29.jpg differ diff --git a/documents/academic/paper/urban-economics-transport-productivity-p30.jpg b/documents/academic/paper/urban-economics-transport-productivity-p30.jpg new file mode 100644 index 0000000..f5d0056 Binary files /dev/null and b/documents/academic/paper/urban-economics-transport-productivity-p30.jpg differ diff --git a/documents/academic/paper/urban-economics-transport-productivity-p31.jpg b/documents/academic/paper/urban-economics-transport-productivity-p31.jpg new file mode 100644 index 0000000..a724205 Binary files /dev/null and b/documents/academic/paper/urban-economics-transport-productivity-p31.jpg differ diff --git a/documents/academic/paper/urban-economics-transport-productivity.md b/documents/academic/paper/urban-economics-transport-productivity.md new file mode 100644 index 0000000..446489c --- /dev/null +++ b/documents/academic/paper/urban-economics-transport-productivity.md @@ -0,0 +1,469 @@ +--- +type: academic-paper +category: urban economics +person: unknown (working paper draft with reviewer annotations) +date: 2017 +source: + - urban-economics-transport-productivity-p03.jpg + - urban-economics-transport-productivity-p04.jpg + - urban-economics-transport-productivity-p07.jpg + - urban-economics-transport-productivity-p08.jpg + - urban-economics-transport-productivity-p10.jpg + - urban-economics-transport-productivity-p12.jpg + - urban-economics-transport-productivity-p14.jpg + - urban-economics-transport-productivity-p15.jpg + - urban-economics-transport-productivity-p17.jpg + - urban-economics-transport-productivity-p19.jpg + - urban-economics-transport-productivity-p25.jpg + - urban-economics-transport-productivity-p26.jpg + - urban-economics-transport-productivity-p29.jpg + - urban-economics-transport-productivity-p30.jpg + - urban-economics-transport-productivity-p31.jpg +--- + +# Urban Economics Working Paper: Transportation Infrastructure, 
Productivity Shocks, and City Evolution + +> Note: This transcription covers 15 scanned pages (pages 3, 4, 7, 8, 10, 12, 14, 15, 17, 19, 25, 26, 29, 30, 31) of a working paper draft with handwritten annotations/comments. Pages not scanned are omitted. The paper appears to be a monocentric urban model studying how city-specific productivity and transportation shocks affect urban configuration, land rents, and property prices. + +--- + +## Page 3 + +talization rate of the land changes over time, and is different in different locations within the city. + +To summarize, this paper extends the monocentric urban model in a number of important ways. Most notably, we are the first to provide a stochastic dynamic model that captures the transmission of productivity shocks through an urban property market. We are not, however, the first to consider dynamic urban models. For example, Berliant and Wang (2005) reviews an expanding literature that considers the link between agglomeration spillovers, capital accumulation and urban growth.^2 In contrast to most of the prior literature, which focuses on the determinants of urban growth, our focus is on how the structure of cities affects the evolution of property prices and rents. There is clearly a relation between the growth rate of an urban economy and the growth rate of land rents and prices, however, as we show, the economic growth rate is just one of several determinants of price changes. + +We are also not the first to study the relation between transportation infrastructure and urban design. The impacts of automobile and highways are studied in Downs (1992), Dunphy (1997), Glaeser and Kahn (2004), Nechyba and Walsh (2004), Baum-Snow (2007a), Ahlfeldt and Wendland (2011), Garcia-Lopez (2012), and Duranton and Turner (2012). In particular, Baum-Snow (2007b) uses a monocentric city model to show theoretically that the construction of new highways in a city cause the population to spread out along the highways. 
The impacts of rail transit are studied in Baum-Snow and Kahn (2005), Anderson (2007) and Gonzalez-Navarro and Turner (2016). Baum-Snow et al. (2016) empirically studies the effects of both road and rail construction in China. LeRoy and Sonstelie (1983) uses a standard Alonso-Muth model to explain the suburbanization and gentrification of city centers. + +A key difference between the existing literature and our contribution is that we focus on city-specific changes in transportation infrastructure and productivity, holding the transportation infrastructure and productivity in other cities fixed. While a universal improvement in transportation technology reduces density and flattens the land value and population gradients, as shown in the above references, city-specific transportation improvements lead to higher density and, in most cases, steeper price and population gradients. This is because the city-specific improvements attract workers from other cities, reinforcing the agglomeration effect and increasing land values. + +We examine how the city-specific productivity shocks affect city evolution. This relates our paper to a recent strand of literature that emphasize the importance disaggregated shocks which are city- or region-specific to a large extent. Firm level shocks are studied in Gabaix (2011) which argues that idiosyncratic shocks to large firms can generate non-trivial aggregate + +^2 Duranton and Puga (2014) provides a very thorough review of this and other literature that examines the determinants of urban growth. Most of this literature considers static models, but provides insights about the determinants of growth by examining various comparative statics. + +--- + +## Page 4 + +fluctuations. Caliendo et al. 
(2014) + +Given a productivity increase, our theoretical framework predicts that cities with better capacity in terms of accommodating in-migration of workers respond with a large expansion of population and a mild increase of wage and rent, while cities with limited capacity respond with a mild expansion of population and a large increase of wage and rent. This is consistent with the empirical findings in Glaeser et al. (2006), although city capacity is measured by the elasticity of housing supply in Glaeser et al. (2006) while in our paper it is reflected in the transportation infrastructure and the flexibility of residential land supply. + +Within a monocentric city framework, our paper studies how the trade-off between agglomeration and congestion/commuting depends on the transport technology. The main message is that better transport technology is conducive to sustained growth -- it weakens the negative externality from congestion and commuting, and strengthens agglomeration. A related work regarding sustained urban growth is Berliant and Wang (2008) that relates the "perpetual" steady-state growth to the endogenous rise of sub-centers of cities. Following the economic growth literature, Berliant and Wang (2008) emphasizes capital accumulation and agglomeration is a function of aggregate capital in a city. In contrast our model highlights that city population is a key determinant of transport cost. Therefore, we model agglomeration as a function of population. + +The rest of the paper is organized as follows. Section 2 reviews the literature and highlights our contribution. Section 3 lays out the model and defines the equilibrium. Section 4 characterizes urban configuration dynamics through a set of propositions. Section 5 presents numerical results that show the full dynamics. Section 6 concludes. + +NOT SURE IF WE WANT TO DISCUSS THE CONTROVERSY ABOUT RAIL TRANSIT HERE The effects of rail transit on employment and social welfare are controversial. 
The development of rail transit has been under considerable debate in the policy community.^3 Affuso et al. (2003) argue that road improvements have substantially higher returns than railway schemes. Winston and Maheshri (2007) further argue that rail construction reduces social welfare and hence not desirable. + +Other papers potential consequence of ignoring durable housing: +"Urban Decline and Durable Housing", the asymmetry between urban decline and urban growth due to durable housing -- homes can be built quickly, but disappear slowly. + +shock propagation: +"NETWORKS AND THE MACROECONOMY: AN EMPIRICAL EXPLORATION", shows empirically that firm level or industry level shocks are propagated through economic networks and geographic networks. + +^3 See a recent summary in Litman (2014) + +--- + +## Page 7 + +The distance $j_i$ is simply a non-linear transformation of the location index $i$, so without loss of generality, we use $j$ to denote both distance and location, with $j = 0$ representing the CBD where the distance is zero. + +We assume that transportation costs represent lost time and are thus linear in wages. Specifically, for workers living in location $j$ to earn the net reservation wage of $W$, they need to earn a wage of $w = W \times e^{f(j,N,\tau)}$. In other words, the net wage of workers living in location $j$ is + +$$W(j) = w \times e^{-f(j,N,\tau)} \qquad (2)$$ + +The cost function $f(j, N, \tau)$ satisfies + +- $j$ = location defined by distance to the CBD +- $N$ = city population +- $\tau$ = transport technology + +Different transport technologies are characterized by different transport cost functions. For example, we assume that a car-based transport technology is more sensitive to population increases than a rail-based technology because highways are more prone to congestion than rails. We make the following assumptions regarding the alternative transportation technologies: + +**Assumptions** Needs assumptions about differentiability + +1. 
$f(0, N, \tau) = 0$, i.e., transport cost is zero for the workers living in the CBD. This is consistent with the definition of distance in equation (1). + +2. $\partial f / \partial N > 0$, i.e., there is a positive congestion effect, such that the cost increases with city population. + +3. $\partial f / \partial j > 0$, i.e., the transport cost gradient, defined as the change of cost with distance to CBD, is positive. + +4. $\partial^2 f / \partial j \partial N = 0$, i.e. the congestion effect is not location-specific.^4 + +Transport cost gradient is of key importance in our analysis, for simplicity of notation we use $f'(j|N, \tau)$ to denote transport cost gradient at location $j$, given population $N$ and technology $\tau$. + +^4 We make this assumption for transparency of the main mechanisms to be developed. Most of the results hold true if we allow the congestion effect to increase with distance from the CBD ($\partial f^2 / \partial j \partial N \geq 0$), which is shown in the technical appendix. + +--- + +## Page 8 + +### 2.2 The Workers + +The model assumes a mass of identical workers who each provide exactly one unit of labor and allocate their wages to land rent, transportation costs, and consumption goods. For notational convenience, we will define a net wage, which is the wage minus the workers cost of commuting to work. Since the workers are identical and have the same external opportunities they each receive the same exogenous reservation level of utility regardless of where they live. + +#### 2.2.1 The Worker's Optimization Problem + +Let $h$ and $c$ be residential land and consumption goods respectively, a worker who lives at location $j$ solves the following optimization problem. + +$$\max_{c,h} = u(c, h)$$ + +$$s.t.$$ + +$$c + p_r(j)h = W(j) \qquad (3)$$ + +where $p_r(j)$ is the rental rate of residential land in location $j$, and $W(j)$ is the net wage as defined in equation (2). 
+ +It is straightforward to show that the optimal allocation between land and consumption goods satisfies, + +$$p_r(j) = \frac{\partial u(c,h)/\partial h}{\partial u(c,h)/\partial c} \qquad (4)$$ + +The right side of the above equation is the marginal rate of substitution between land and the consumption good. Given the assumed Cobb-Douglas utility function, i.e., $u(c, h) = c^{1-\theta} h^{\theta}$, equation (4) becomes + +$$p_r(j) = \frac{\theta}{1-\theta} \frac{c}{h} \qquad (5)$$ + +From equation (5), we get $c = \frac{1-\theta}{\theta} p(d)h$. Substituting this into the budget constraint (equation (3)) yields the optimal consumption good choice, + +$$c = (1-\theta)W(j) \qquad (6)$$ + +and land demand function + +$$h = \theta \frac{W(j)}{p_r(j)} \qquad (7)$$ + +Workers also choose where to live, but since they all receive the same reservation utility, the location choice is irrelevant for individual workers in equilibrium. Indeed, the rental price of land at each location is determined within the equilibrium to make all workers indifferent. + +--- + +## Page 10 + +rent as a function of the number of workers living in given location. Combining equation (9) and (11), the relation between land rent and population in location $j$ is + +$$p_r(j) = B_0^{-\frac{\theta}{1-\theta}} \theta^{\frac{1}{1-\theta}} n(j)^{\frac{1}{1-\theta}} \qquad (12)$$ + +Thus, residential land rent is an increasing function of population in each location driven by the net wage. + +#### 2.2.3 Rent Gradient + +The residential rent gradient describes the land rent as a function of the distance from the CBD, i.e., + +$$p'_r(j) = -\frac{B_0}{\theta} W(j)^{1/\theta} f'(j|N, \tau) \qquad (13)$$ + +The rent gradient depends on the transport cost gradient $f'(j|N, \tau)$. All else equal, a lower transport cost gradient generates lower rents at each distance from the CBD. 
However, in our model, since reservation utility is assumed to be fixed, lower transport costs must be offset by something that keeps reservation utility constant. For example, if a city is endowed with a new technology that exogenously lowers its transportation costs, its population will increase, which endogenously increases the transport cost gradient because we assume $\partial^2 f / \partial j \partial N \geq 0$. + +### 2.3 The Firms + +There exists a unit measure of identical firms that use land in the citys CBD along with labor to produce the consumption good. We assume a constant returns to scale Cobb-Douglas production function, + +$$F(\ell, n) = A \ell^{\sigma} n^{1-\sigma} \qquad (14)$$ + +where $\ell_t$ and $n_t$ are land and labor respectively. $A$ is the total factor productivity (TFP) of this city relative to other cities.^6 + +Let $p_c$ be the rental rate of commercial land, the firm's optimization problem is + +$$\max_{\ell, n} F(\ell, n) - wn - p_c \ell$$ + +subject to equation (14). From the first-order condition, we obtain the usual allocation rule of a Cobb-Douglas production function: + +$$\frac{n}{\ell} = \frac{1-\sigma}{\sigma} \frac{p_c}{w} \qquad (15)$$ + +^6 A is endogenous in the dynamic setting due to the positive externality of agglomeration. + +--- + +## Page 12 + +is constant the individuals at the periphery consume the same mix of land and the consumption good regardless of the productivity and transport technology. As we note below, this will not be the case when the firm has a fixed boundary. + +**Fixed Boundaries** When the city's boundary is fixed, the land price at the periphery is determined by the bid-rent function. + +### 2.5 General Equilibrium + +The model has three interdependent markets that must clear simultaneously: the commercial land market, the residential land market and the labor market. In this setting, firms make zero profit and households receive their reservation utility. 
As such, as in Fujita (1989), Wheaton (1998) and Rossi-Hansberg (2004), the social optimum occurs when aggregate rent is maximized. + +#### 2.5.1 Land Market Equilibrium + +Equilibrium in the residential land market implies a relation between wage and population as shown in the curves labeled "Land Eqlm" in figure 1. A higher wage attracts a larger population to maintain land market equilibrium, leading to an upward sloping land market equilibrium curve. Intuitively, a higher wage leads to a higher residential bid-rent, which causes more agricultural land in the periphery to be converted to urban land. The higher rents lead to lower land demand per worker (equation 10). + +\*\*I find this a bit confusing ?since we are considering the interaction between 3 endogenous variables. To me it is easier to talk in terms of shifts in the exogenous productive variable. An increase in productivity increases wages, which in turn attracts more workers, which in turn increases rents, causing agricultural land to convert to urban land. + +We will refer to this relation between wage and population as the land market equilibrium condition. To formally present the condition, we start with equation (11) that shows the number of workers that are accommodated in location $j$ given the effective wage $W(j)$. Aggregating workers in each location leads to: + +$$N = \int_0^{J(w)} n(j) dj$$ + +$$= S_r \frac{B_0}{\theta} w^{(1-\theta)/\theta} dj + \int_1^{J(w)} \frac{B_0}{\theta} W(j)^{(1-\theta)/\theta} dj$$ + +$$= S_r \frac{B_0}{\theta} w^{(1-\theta)/\theta} dj + \frac{B_0}{\theta} w^{\frac{1-\theta}{\theta}} \int_1^{J(w)} e^{-\frac{1-\theta}{\theta} f(j,N,\tau)} dj \qquad (18)$$ + +Equation (18) is the relation between wage and population when residential land market clears, i.e. all the residential land available given wage $w$ is occupied by workers. 
Note that the + +--- + +## Page 14 + +This labor demand equation can be rewritten into + +$$w = (1-\sigma) A \left(\frac{S_c}{N}\right)^{\sigma}, \qquad (20)$$ + +where we have replaced $N^*$ with the equilibrium total employment $N$. Since workers have perfect mobility and the reservation utility is identical across cities, equation is also the labor market equilibrium condition when the CBD is segmented. + +\*\* Im a little bit confused about terminology. We can specify a labor demand function, which specifies the amount of labor the firm will hire as a function of the wage and the commercial rent. This function is going to be the same in both the segmented and non-segmented CBDs. Of course, the equilibrium rents and wages will be different depending on the segmentation. I think this is what you say below. So would it be better to first specify that labor demand function and then separately look at the two cases? + +**Non-segmented CBD** In this case, equation 2.5.2 is still the land demand function, but $S_c$ now changes with wage rate, and there is an additional condition for labor market to clear: the equality of commercial and residential land rents. Using the residential bid-rent function (equation 9) and the commercial bid-rent function (16), $p_r = p_c$ implies + +$$\left[\frac{A\sigma^{\sigma}(1-\sigma)^{1-\sigma}}{w^{1-\sigma}}\right]^{1/\sigma} = B_0 w^{1/\theta} \qquad (21)$$ + +where we used the condition $f(j, N, \tau) = 0$ for $j = 0$. + +Denote $\Psi = \frac{\sigma^{\sigma}(1-\sigma)^{1-\sigma}}{B_0^{\sigma}}$, Equation (21) can be re-written into + +$$w = (\Psi A)^{\theta/(\sigma+\theta-\theta\sigma)} \qquad (22)$$ + +Equation (22) represents the labor market equilibrium curve in the case of non-separable land markets. Given a productivity, there is only one wage that satisfies $p_r = p_c$. 
The slight rise of wage above this fixed level leads to a higher residential land rent and lower commercial land rent, which causes the entire commercial land to be converted into residential use. Therefore the labor market equilibrium curve is flat. + +#### 2.5.3 Equilibrium Wage, Population and Commercial Land + +The equilibrium wage, population and size of commercial land are determined when both the land market and the labor market clear. Equilibrium wage and population are determined by the intersection of the land market equilibrium curve and labor market equilibrium curve. The equilibrium is illustrated in figure (1), where land and labor market equilibrium curves are labeled "Land Eqlm" and "Labor Eqlm", respectively. The left panel is the graphical representation of equations (18) and (22), and the right panel of equations (18) and (19). + +--- + +## Page 15 + +**Figure 1: Labor Market and Land Market Equilibria** + +``` +CBD not segmented CBD segmented + + | Land Eqlm | Land Eqlm + | / | / + |/ |/ +w --|--------- Labor Eqlm w --|--------\ + | | \ Labor Eqlm + | | + +----------→ N +----------→ N +``` + +Note: This figures shows the equilibrium wage and population in cities with non-segmented CBD (left panel) and segmented CBD (right panel). The "Land Eqlm" curve represents land market equilibrium and the $N^d$ curve represents labor demand of the firms. + +Given the equilibrium wage and population, land rents (both commercial and residential) are easily calculated from the bid-rent functions. In the case of non-segmented CBD, the size of commercial land ($S_c$) is solved from equation (2.5.2). + +#### 2.5.4 Uniqueness of Equilibrium + +In each of the four case shown in table 1, the endogenous variables are uniquely determined, hence the model has an unique equilibrium. + +To show the uniqueness, we start from the case of a city with a segmented CBD and fixed boundaries. 
The model has the following endogenous variables: $(w, N, p_r, p_c)$ that are solved from equations (9), (16), (18), and (19), namely the residential bid-rent function, commercial bid-rent function, land market equilibrium condition and labor demand equation.
Second, in the case of non-segmented land markets, commercial land also remains relatively high so that firms do not substitute out labor for land. Since the city-level productivity depends on the lagged population, the lower population volatility implies lower volatilities of productivity, wage, and other dimensions of urban configuration. + +### 3.1 Worker's problem + +For simplicity, we consider only productivity shocks in one city where we refer to existing workers as "incumbents" and workers outside the city as "outsiders". The one-period indirect utility of incumbents is $u(\tilde{A})$, which is a function of the productivity shock, and the one-period utility of the outsiders is $u^*$, which is assumed to be constant (i.e., the reservation utility in the basic model). A worker takes the present and discount sum of the future utilities into account when making the moving decisions. Specifically, given a productivity shock $\tilde{A}$, an incumbent compares between the values of staying and moving, denoted $V^{stay}(\tilde{A})$ and $V^{move}(\tilde{A})$ respectively. Therefore the value of an incumbent given $\tilde{A}$ is: + +$$V(\tilde{A}) = \max\{V^{stay}(\tilde{A}), V^{move}(\tilde{A})\}. \qquad (23)$$ + +Similarly, an outsider compares the values of staying and moving, denoted $W^{stay}(\tilde{A})$ and $W^{move}(\tilde{A})$ respectively. The value of an outsider given the productivity shock $\tilde{A}$ is + +$$W(\tilde{A}) = \max\{W^{stay}(\tilde{A}), W^{move}(\tilde{A})\} \qquad (24)$$ + +where $\tilde{A}$ is NOT the productivity shock of the cities where the outsiders live, because we assume no shock happens to cities of outsiders. 
+ +The functions of $V^{stay}(\tilde{A})$, $V^{move}(\tilde{A})$, $W^{stay}(\tilde{A})$, and $W^{move}(\tilde{A})$ have the following recursive representations: + +$$V^{stay}(\tilde{A}) = u(\tilde{A}) + \beta EV(\tilde{A}') \qquad (25)$$ + +$$V^{move}(\tilde{A}) = (1-\delta)u^* + \beta EW(\tilde{A}') \qquad (26)$$ + +$$W^{stay}(\tilde{A}) = u^* + \beta EW(\tilde{A}') \qquad (27)$$ + +$$W^{move}(\tilde{A}) = (1-\delta)u(\tilde{A}) + \beta EV(\tilde{A}') \qquad (28)$$ + +--- + +## Page 19 + +Therefore, we have + +$$V(\tilde{A}) = \max\{V^{stay}(\tilde{A}), V^{move}(\tilde{A})\}$$ + +$$= V^{stay}(\tilde{A})$$ + +$$= u(\tilde{A}) + \beta EV(\tilde{A}')$$ + +Taking expectation with respect to the productivity shock yields + +$$EV(\tilde{A}) = Eu(\tilde{A}) + \beta EV(\tilde{A}'). \qquad (29)$$ + +Since the productivity shocks are i.i.d, $EV(\tilde{A}) = EV(\tilde{A}')$. Equation (29) becomes + +$$EV(\tilde{A}') = \frac{1}{1-\beta} Eu(\tilde{A}). \qquad (30)$$ + +Similarly, $W^{stay}(\tilde{A}) \geq W^{move}(\tilde{A})$ should hold for any $\tilde{A}$. Otherwise every outsiders would choose to move, which is inconsistent with the labor demand function and the residential land market equilibrium. Therefore, $W(\tilde{A}) = \max\{W^{stay}(\tilde{A}), W^{move}(\tilde{A})\} = W^{stay}(\tilde{A}) = u^* + \beta EW(\tilde{A})$. Taking expectation with respect to $\tilde{A}$ leads to + +$$EW(\tilde{A}') = \frac{1}{1-\beta} u^*. \qquad (31)$$ + +### 3.3 One-period Utility as a Function of Productivity Shock + +Using the equilibrium condition $EV(\tilde{A}) = EW(\tilde{A})$, equation (30)-(31) imply + +$$Eu(\tilde{A}) = u^*, \qquad (32)$$ + +i.e. the one-period indirect utility of incumbents equals the reservation utility of outsiders on average. + +Now we study how $u(\tilde{A})$ is determined for each individual $\tilde{A}$. First of all, everything else equal, $u(\tilde{A})$ should be non-decreasing with $\tilde{A}$. 
Intuitively, labor demand falls when a negative $\tilde{A}$ occurs. If the utility level rises when $\tilde{A}$ is negative, then labor supply cannot fall, and it is impossible for the labor market to clear. + +Given that utility is non-decreasing in productivity, the response of the economy to a negative productivity shock is illustrated in figure 2. The original equilibrium is point $E_1$ where the curves of land and labor market equilibria intersect. A negative productivity shock shifts the "Labor Eqlm" curve leftward, reaching point $E_2$ with the wage and population pair falls from $(w_1, N_1)$ to $(w_2, N_2)$. This is the new equilibrium in the baseline model where the reservation utility is fixed. However, when the utility level falls with productivity, the "Land Eqlm" curve shifts to the right because the lower utility leads to a higher bid-rent, which not only causes + +--- + +## Page 25 + +in cities with fixed boundaries. Consequently the firms are not able to have land rent lowered hence not able to compensate workers for the congestion. On the other hand, if land markets are separable, owners of commercial land are more willing to reduce rent, hence firms are able to compensate for the congestion. Clearly, the situation of compensated congestion should have higher elasticities of wage and population relative to the case of un-compensated congestion. Since the compensation comes from lowered commercial land rent, the $\zeta_{p_c}$ is smaller in the case of separable land markets (the case of compensated congestion). + +**Segmented CBD Versus Nonsegmented CBD (In Open Cities)** + +**Proposition 4** *Consider open cities, let $\zeta^*$ be the elasticity in a city with segmented CBD, and $\zeta$ be the elasticity in a city with non-segmented CBD, then:* + +1. 
*$\zeta_N^* \geq \zeta_N$, $\zeta_w^* \geq \zeta_w$, $\zeta_{P_r}^* \geq \zeta_{P_r}$, and $\zeta_{p_c}^* \leq \zeta_{p_c}$ for cities with a high gradient of transport cost and a strong congestion effect (e.g. car-based cities)* + +2. *the inequalities are all reversed for cities with a low gradient of transport cost and a weak congestion effect (rail-based cities).* + +Proof is in Appendix D + +The proposition has important implications regarding whether residential and commercial land markets should be separated. On the one hand, for rail-based cities (with variable boundaries), non-separation of land markets policy is suggested as it is more conducive to growth. Intuitively, rail-based cities are able to expand their peripheries, so allowing the conversion of residential land near the CBD into commercial use effectively lowers commercial land rent, causing more growth in population and wage. On the other hand, for car-based cities, expansion in the peripheries is economically less viable, so residential land rent near the CBD is relatively more expensive. Consequently, under the non-separation policy owners of commercial land are unwilling to reduce rent to compensate for the congestion. Therefore for car-based cities our suggestion is to separate the commercial and residential land markets. + +#### 4.1.2 Rent Growth and Cap Rate + +**Proposition 5** *The cross derivative $\frac{\partial^2 p_r(j)}{\partial j \partial A}$ is less than zero and its absolute value increases with the congestion effect ($\frac{\partial f}{\partial N}$), implying:* + +1. *a positive (negative) productivity shock causes steeper (flatter) land gradient;* + +2. *a positive (negative) productivity shock causes more rise (fall) of land rent near the CBD and less rise (fall) in far out areas.* + +--- + +## Page 26 + +*Furthermore, the above two effects are stronger in cities with stronger congestion effect.* + +Proof is given in Appendix E. 
Thus rent of land near the CBD is more responsive to productivity shocks. Given a positive productivity shock, land near the CBD has a higher growth of rent compared with farther-out land, particularly for cities with strong congestion effect. + +The growth rate of land rent is closely related to the capitalization rate of land and property. Everything else equal, higher growth rate implies lower capitalization rate. We consider the simplest case where land price is the discounted sum of future rent which grows at a constant factor $G_p(j)$. Using $P_0(j)$ to denote land price in location $j$ at time 0, then + +$$P_0(j) = \sum_{t=0}^{\infty} \beta^t p_{r_t}(j),$$ + +$$= \sum_{t=0}^{\infty} \beta^t p_{r_0}(j) G_p(j)^t,$$ + +$$= p_{r_0}(j) \sum_{t=0}^{\infty} (\beta G_p(j))^t,$$ + +$$= \frac{p_{r_0}(j)}{1 - \beta G_p(j)}$$ + +Therefore the capitalization rate at location $j$ is + +$$CapRate = \frac{p_{r_0}(j)}{P_0(j)},$$ + +$$= 1 - \beta G_p(j),$$ + +i.e., the capitalization rate decreases with the growth rate of rent. + +Within our framework, the changes of rent are caused by productivity shock. If a city receives positive productivity shocks and thus population and wage increase for a lengthy period of time, then land near the CBD consistently has higher growth rate of rent than land in farther-out locations according to *Proposition 5*. It is well recognized that the rise and fall of cities are highly persistent.^10 In light of this, when a city rises land near the CBD has lower capitalization rate. More specifically, we have the following corollary to *Proposition 5*. + +**Corollary** *If productivity shocks cause sufficiently persistent rise and fall of cities, then:* + +1. *When a city rises, the capitalization rate of residential land increases with the distance to the CBD. The rate of increase is larger for cities with larger $\frac{\partial f}{\partial N}$.* + +2. 
*When a city falls, the capitalization rate of residential land decreases with the distance to the CBD. The rate of decrease is larger for cities with larger $\frac{\partial f}{\partial N}$.* + +^10 See Davis et al. (2016) and the references therein. + +--- + +## Page 29 + +**Figure 4: Changes to Urban Configurations Due to Productivity Increase** + +[Six panels of graphs showing changes over time (x-axis: Time, 0-15) for car-based and rail-based cities:] + +1. **Productivity**: Both car and rail increase from ~1.08 to ~1.095-1.10 over time. Car (solid line) and rail (dashed line) follow similar paths. + +2. **Population** (x10^4): Both increase. Rail rises more steeply (to ~9.5) than car (to ~8.5). + +3. **Wage**: Both increase from ~1.0. Car rises more steeply (to ~1.035) than rail (to ~1.015). + +4. **Land Rental Rate (Commercial)**: Both increase from ~2300. Car rises more (to ~2500+) than rail. + +5. **Land Rental Rate (CBD Residential)**: Both increase from ~2300. Car rises more than rail. + +6. **Total Land Rent** (x10^4): Both increase. Rail rises more steeply (to ~2.7) than car (to ~2.0). + +Note: This figure shows the changes over time in urban configurations due to a 1% increase in productivity under the scenario of flexible CBD and flexible city boundary, for both car-based and rail-based cities. Relative to car-based cities, rail-based cities feature higher fixed transportation cost, but lower gradient of transportation cost with respect to distance. In addition, rail-based cities have smaller congestion effect. + +**Figure 5: Transport Cost Before and After Productivity Increase** + +[Two panels:] + +1. **Transport Cost** (y-axis: 0.03-0.09, x-axis: Distance from CBD in km, 1-25): Shows transport cost as a function of distance. Car has steeper gradient than rail. After productivity increase (new SS), both shift upward slightly. + +2. 
**Rent Gradient** (y-axis: 1900-2500, x-axis: Distance from the CBD in km, 1-20): Shows rent declining with distance. Car (solid) has steeper decline than rail (dashed). New steady state lines show upward shift. Legend: car, car (new SS), rail, rail (new SS). + +Note: This figure shows the changes of transport cost due to a 1% increase in productivity and after the city converges to the new steady state equilibrium, under the scenario of flexible CBD and flexible city boundary. + +--- + +## Page 30 + +#### 4.2.3 Volatilities + +**Table 2: Percentage Changes Due to a 1% Increase In Productivity** + +| Transport | A | wage | N | P | Q | P | q | Size of CBD | Radius of City | +|-----------|-------|-------|--------|--------|--------|--------|--------|-------------|----------------| +| | | | | | | | | | | +| **open city; non-segmented CBD** | | | | | | | | | | +| Car | 2.236 | 2.212 | 49.98 | 50.43 | 53.30 | 7.57 | 7.565 | 42.52 | 22.76 | +| Rail | 2.987 | 2.935 | 91.46 | 91.92 | 97.12 | 10.20 | 10.20 | 78.88 | 38.67 | +| | | | | | | | | | | +| **open city; segmented CBD** | | | | | | | | | | +| Car | 2.111 | 1.941 | 44.021 | 44.452 | 46.82 | 6.62 | 46.82 | 0 | 20.03 | +| Rail | 2.746 | 2.478 | 76.993 | 77.525 | 81.38 | 8.50 | 81.38 | 0 | 32.97 | +| | | | | | | | | | | +| **closed city; non-segmented CBD** | | | | | | | | | | +| Car | 1.070 | 1.058 | 2.329 | 3.343 | 3.412 | 3.572 | 3.572 | -0.154 | 0 | +| Rail | 1.071 | 1.060 | 2.383 | 3.421 | 3.469 | 3.577 | 3.577 | -0.105 | 0 | +| | | | | | | | | | | +| **closed city; segmented CBD** | | | | | | | | | | +| Car | 1.070 | 1.059 | 2.329 | 3.345 | 3.413 | 3.574 | 3.413 | 0 | 0 | +| Rail | 1.071 | 1.061 | 2.384 | 3.423 | 3.469 | 3.579 | 3.469 | 0 | 0 | + +Note: Given a one percent increase in productivity, the city rises and converges to a new steady state equilibrium. 
This table shows the total changes (in percentage) in productivity, wage, population, total residential land rent, total commercial land rent, rental rate of residential land, rental rate of commercial land, size of the CBD and radius of the city. + +--- + +## Page 31 + +**Figure 6: City Volatility and Moving Cost** + +[Six panels showing simulation paths over time (x-axis: 0-150) arranged in two columns: delta=0% (left) and delta=1% (right)] + +**Top row - Wage:** +- delta=0%: Wage fluctuates between ~0.87-1.05, with car (solid) and rail (dashed) lines. Car shows a dramatic decline around time 100. +- delta=1%: Wage fluctuates between ~0.95-1.05, more stable. Both car and rail remain closer together. + +**Middle row - Population** (x10^6): +- delta=0%: Population fluctuates between 0-14 million. Car shows volatile swings with a collapse near time 100. Rail shows similar but less extreme pattern. +- delta=1%: Population is more stable, fluctuating between ~5-10 million. Rail (dashed) stays higher and more stable than car. + +**Bottom row - CBD Rent:** +- delta=0%: CBD rent fluctuates between ~1500-3000. Car shows volatile behavior with a collapse near time 100. +- delta=1%: CBD rent is more stable, fluctuating between ~2300-2800. Both car and rail show moderate fluctuations. + +Note: This figure shows simulation paths of wage, population and CBD rent with and without moving cost. 
diff --git a/documents/academic/phd_defense/approval_of_candidacy.md b/documents/academic/phd_defense/approval_of_candidacy.md new file mode 100644 index 0000000..35fcc0c --- /dev/null +++ b/documents/academic/phd_defense/approval_of_candidacy.md @@ -0,0 +1,69 @@ +--- +type: form +category: academic +person: Yanxin Lu +date: 2017 +source: approval_of_candidacy.pdf +--- + +# Approval of Candidacy for the Doctoral Degree + +**Date:** July 10, 2017 + +**Approved:** [Signature] + +The candidacy of **Yanxin Lu (ID # S01179519)**, Department of **Computer Science**, for the **Doctor of Philosophy** degree, is hereby approved. A timeline for defense of theses is located in the General Announcements. + +The Thesis Committee hereby appointed is: + +- a) Swarat Chaudhuri (Chair) +- b) Chris Jermaine +- c) Ankit Patel + +--- + +## The student's responsibilities relating to the oral defense are to: + +(1) Schedule the examination. You must be registered the semester you defend. + +(2) See General Announcements for requirements concerning the announcement of date, time, and place of the oral examination. + +(3) Submit to the online thesis submission site the signed original of this form within one week after the oral examination. Students who pass the oral examination defense of thesis on or before Friday of the first week of classes of any semester do not have to register for that or any subsequent semester even though minor revisions to the final copy may be continuing. Exemption from registration is dependent upon receipt of this properly signed form in the office of Graduate Studies. + +(4) You have six (6) months after the successful completion of your defense to submit your thesis documents online and to the office of Graduate Studies. 
+ +*Additional responsibilities of the candidate are outlined in the thesis instructions and in the Rice General Announcements.* + +--- + +**Will you continue to register the semester after your defense?** No (checked) + +**I certify that all of the information listed on this form is correct.** +[Student signature] + +--- + +**Thesis Title (PLEASE HAND-PRINT LEGIBLY):** Corpus-Driven Systems for Program Synthesis and Refactoring + +## Report of the Thesis Committee: + +**The date of the oral defense of thesis was:** 11/19/18 + +The committee hereby certifies that the candidate passed. Signatures MUST be **ORIGINAL**. + +- (a) [Signature - Swarat Chaudhuri] +- (b) [Signature - Christopher Jermaine] +- (c) [Signature - Ankit Patel] +- (d) ___ +- (e) ___ +- (f) ___ + +**Remarks:** ___ + +--- + +## Graduate Office Record + +**Thesis Accepted:** ___ (Dean of Graduate and Postdoctoral Studies) + +**Date:** ___ diff --git a/documents/academic/phd_defense/approval_of_candidacy.pdf b/documents/academic/phd_defense/approval_of_candidacy.pdf new file mode 100644 index 0000000..4019550 Binary files /dev/null and b/documents/academic/phd_defense/approval_of_candidacy.pdf differ diff --git a/documents/academic/phd_defense/certificate_of_completion.md b/documents/academic/phd_defense/certificate_of_completion.md new file mode 100644 index 0000000..4bea2d1 --- /dev/null +++ b/documents/academic/phd_defense/certificate_of_completion.md @@ -0,0 +1,26 @@ +--- +type: certificate +category: academic +person: Yanxin Lu +date: 2019 +source: certificate_of_completion.pdf +--- + +# Certificate of Completion + +## of the + +## Survey of Earned Doctorates + +### Questionnaire + +This certifies that Yanxin Lu, +a student at Rice University - Graduate School, +has completed the Survey of Earned Doctorates +on 04/18/2019. + +**Confirmation Code:** 227757UTRH + +--- + +*Sponsored by: NSF, NIH, U.S. 
Department of Education, National Endowment for the Humanities* diff --git a/documents/academic/phd_defense/certificate_of_completion.pdf b/documents/academic/phd_defense/certificate_of_completion.pdf new file mode 100644 index 0000000..100cdb0 Binary files /dev/null and b/documents/academic/phd_defense/certificate_of_completion.pdf differ diff --git a/documents/academic/phd_defense/defense_evaluation_form.md b/documents/academic/phd_defense/defense_evaluation_form.md new file mode 100644 index 0000000..c99990c --- /dev/null +++ b/documents/academic/phd_defense/defense_evaluation_form.md @@ -0,0 +1,41 @@ +--- +type: form +category: academic +person: Yanxin Lu +date: 2018 +source: defense_evaluation_form.pdf +--- + +# Department of Computer Science +# Rice University +## Evaluation of Ph.D. Thesis and Defense + +**Student name:** Yanxin Lu + +**Semester entered program:** Fall 2012 + +**Date of examination:** Mon., 11/19/18 + +Please review the guidelines for evaluation on the reverse side of this form, and choose exactly one of the rankings for each criterion. + +| Criterion | Excellent | Satisfactory | Unsatisfactory | +|---|---|---|---| +| Problem statement | | X | | +| Impact of work | X | | | +| Technique | | X | | +| Content of results | | X | | +| Thesis text | | X | | +| Oral Presentation | | X | | +| Replies to questions | | X | | + +## Summary Evaluation + +I have read this thesis / proposal and recommend: **X award Ph.D.** + +- [ ] deny Ph.D. + +**Comments:** +1. Modify thesis in consultation with Prof. Chaudhuri +2. 
Expand & submit work on API refactoring + +**Signatures:** [Three signatures] diff --git a/documents/academic/phd_defense/defense_evaluation_form.pdf b/documents/academic/phd_defense/defense_evaluation_form.pdf new file mode 100644 index 0000000..2460ec0 Binary files /dev/null and b/documents/academic/phd_defense/defense_evaluation_form.pdf differ diff --git a/documents/academic/phd_defense/defense_evaluation_guidelines.md b/documents/academic/phd_defense/defense_evaluation_guidelines.md new file mode 100644 index 0000000..b3e533b --- /dev/null +++ b/documents/academic/phd_defense/defense_evaluation_guidelines.md @@ -0,0 +1,53 @@ +--- +type: form +category: academic +person: Yanxin Lu +date: 2018 +source: defense_evaluation_guidelines.pdf +--- + +# Department of Computer Science +# Rice University +## Guidelines for Evaluating Ph.D. Thesis and Defense + +### Problem statement + +- **Excellent:** Compelling problem statement that demonstrates the challenge and utility of the work, as well as theoretical or practical applications +- **Satisfactory:** Problem is clearly stated; a case is made for utility, application +- **Unsatisfactory:** Problem is not clearly stated; little context or justification + +### Impact of work + +- **Excellent:** Groundbreaking work or a novel problem; a thesis that will change the literature +- **Satisfactory:** Solves an important or novel problem; quality of work merits publication in important venues +- **Unsatisfactory:** Obvious extension to the existing literature + +### Technique + +- **Excellent:** Develops new approach to solution or applies techniques that are novel to the area +- **Satisfactory:** Uses established techniques to solve novel problems +- **Unsatisfactory:** Uses techniques incorrectly or inappropriately + +### Content of results + +- **Excellent:** Computer Science content is substantial and correct +- **Satisfactory:** Content has acceptable depth and breadth & requires only minor corrections +- 
**Unsatisfactory:** Content is shallow and/or contains significant errors + +### Thesis text + +- **Excellent:** Well organized text, fluent prose, and few grammatical errors +- **Satisfactory:** Acceptable organization & text, limited grammatical errors +- **Unsatisfactory:** Poor organization, difficult prose, or numerous grammatical errors + +### Oral Presentation + +- **Excellent:** Engaging, polished presentation with well crafted visual aides that illustrate key results; includes a substantial conclusion +- **Satisfactory:** Professional presentation on a par with a solid conference talk; includes a coherent project narrative and conclusion +- **Unsatisfactory:** Too much or too little detail; unclear about project goals and direction; incoherent slides; candidate reads from slides + +### Replies to questions + +- **Excellent:** Complete answers that demonstrate a deep understanding of the discipline that extends beyond the thesis +- **Satisfactory:** Competent answers that illustrate a facility with the issues and techniques immediately relevant to the thesis +- **Unsatisfactory:** Answers reveal a limited comprehension of the work and its context diff --git a/documents/academic/phd_defense/defense_evaluation_guidelines.pdf b/documents/academic/phd_defense/defense_evaluation_guidelines.pdf new file mode 100644 index 0000000..83b0113 Binary files /dev/null and b/documents/academic/phd_defense/defense_evaluation_guidelines.pdf differ diff --git a/documents/academic/phd_defense/signed_title_page.md b/documents/academic/phd_defense/signed_title_page.md new file mode 100644 index 0000000..1119026 --- /dev/null +++ b/documents/academic/phd_defense/signed_title_page.md @@ -0,0 +1,43 @@ +--- +type: thesis +category: academic +person: Yanxin Lu +date: 2019 +source: signed_title_page.pdf +--- + +# RICE UNIVERSITY + +## Corpus-Driven Systems for Program Synthesis and Refactoring + +by + +### Yanxin Lu + +A Thesis Submitted +in Partial Fulfillment of the +Requirements 
for the Degree + +### Doctor of Philosophy + +--- + +Approved, Thesis Committee: + +**Swarat Chaudhuri, Chair** +Associate Professor of Computer Science +[Signature] + +**Christopher Jermaine** +Professor of Computer Science +[Signature] + +**Ankit B. Patel** +Assistant Professor of Electrical and Computer Engineering +[Signature] + +--- + +Houston, Texas + +April, 2019 diff --git a/documents/academic/phd_defense/signed_title_page.pdf b/documents/academic/phd_defense/signed_title_page.pdf new file mode 100644 index 0000000..43946a5 Binary files /dev/null and b/documents/academic/phd_defense/signed_title_page.pdf differ diff --git a/documents/academic/phd_defense/thesis_final.md b/documents/academic/phd_defense/thesis_final.md new file mode 100644 index 0000000..310d58e --- /dev/null +++ b/documents/academic/phd_defense/thesis_final.md @@ -0,0 +1,113 @@ +--- +type: thesis +category: academic +person: Yanxin Lu +date: 2019 +source: thesis_final.pdf +--- + +# RICE UNIVERSITY + +## Corpus-Driven Systems for Program Synthesis and Refactoring + +by + +### Yanxin Lu + +A Thesis Submitted +in Partial Fulfillment of the +Requirements for the Degree + +### Doctor of Philosophy + +--- + +Approved, Thesis Committee: + +**Swarat Chaudhuri, Chair** +Associate Professor of Computer Science +[Signature] + +**Christopher Jermaine** +Professor of Computer Science +[Signature] + +**Ankit B. Patel** +Assistant Professor of Electrical and Computer Engineering +[Signature] + +Houston, Texas + +April, 2019 + +--- + +## Abstract + +### Corpus-Driven Systems for Program Synthesis and Refactoring + +by + +Yanxin Lu + +Software development is a difficult task. Programmers need to work with many small components in large software projects which typically contain more than thousands of lines of code. To make software development manageable, developers and researchers have deployed various programming systems and tools. 
These include the ones that can facilitate refactoring existing source code and even generate programs automatically. One problem with traditional program synthesis tools is that they cannot generate practical results when given large specifications due to its high complexity of the underlying problem. Furthermore, existing refactoring systems can only refactor individual components separately and fail to instantiate complete programs. To overcome these problems, we can learn useful patterns and idioms from large code corpora using machine learning techniques. Researchers have used "big code" and developed novel and practical programming tools such as Bayou [1] and JSNice [2]. In this thesis, we present two data-driven programming systems for software reuse and refactoring. + +We first introduce *program splicing*, a programming methodology that aims to automate the workflow of copying, pasting, and modifying code available online. Here, the programmer starts by writing a "draft" that mixes unfinished code, natural language comments, and correctness requirements. A program synthesizer that interacts with a large, searchable database of program snippets is used to automatically complete the draft into a program that meets the requirements. Our evaluation uses the system in a suite of everyday programming tasks and includes a comparison with a state-of-the-art competing approach as well as a user study. The results point to the broad scope and scalability of program splicing and indicate that the approach can significantly boost programmer productivity. + +Next, we propose an algorithm that automates the process of *API refactoring*, where the goal is to rewrite an API call sequence into another sequence that only uses the API calls defined in the target library without modifying the functionality. We solve the problem of API refactoring by combining the techniques of API translation and API sequence synthesis. 
Specifically, we first translate original API calls into a set of new API calls defined in the target library. Then we use an API synthesizer to generate a complete program that uses the translated API calls. We evaluated our algorithm on a diverse set of benchmark problems, and our algorithm can refactor API sequences with high accuracy. + +Although the evaluations of the techniques presented in this thesis are quite optimistic, we believe that there is room for improvement by using more sophisticated language model and advanced search algorithm for program splicing. To improve our API refactoring method, one can train statistical models by using existing API call sequence pairs. Besides these potential improvements, many problems related to "big code" still remain, and the potential of using a data-driven method to help programming is enormous. + +--- + +## Contents + +- Abstract ... ii +- List of Illustrations ... vi +- List of Tables ... viii + +### 1 Introduction ... 1 +- 1.1 Program reuse via splicing ... 6 +- 1.2 API refactoring using natural language and API synthesizer ... 8 +- 1.3 Summary ... 11 + +### 2 Program Splicing ... 12 +- 2.1 Introduction ... 12 +- 2.2 Motivating Examples ... 14 + - 2.2.1 Reading a Matrix from a CSV File ... 14 + - 2.2.2 Face Detection using OpenCV ... 19 +- 2.3 Problem formulation ... 21 +- 2.4 Method ... 24 + - 2.4.1 Searching for programs ... 24 + - 2.4.2 Program completion ... 26 +- 2.5 Evaluation ... 31 + - 2.5.1 Benchmarks ... 32 + - 2.5.2 Experiments ... 35 +- 2.6 Summary ... 45 + +### 3 API Refactoring ... 46 +- 3.1 Introduction ... 46 +- 3.2 Motivating Examples ... 49 +- 3.3 Problem Definition ... 54 +- 3.4 Method ... 55 + - 3.4.1 API Translation ... 56 + - 3.4.2 API Call Sequence Synthesis ... 58 +- 3.5 Evaluation ... 62 + - 3.5.1 Benchmarks ... 62 + - 3.5.2 Experiments ... 63 + - 3.5.3 Limitations ... 69 +- 3.6 Summary ... 69 + +### 4 Related Work ... 71 +- 4.1 Program Synthesis and Reuse ... 
71 +- 4.2 Data-driven Program Synthesis ... 74 +- 4.3 Code Search ... 77 +- 4.4 API Refactoring and Translation ... 81 + +### 5 Conclusion and Future Work ... 85 + +### Bibliography ... 89 + +--- + +*Note: This is the final signed version of the thesis (1.6MB). The full thesis contains 95+ pages of technical content including figures, tables, algorithms, code examples, experimental results, and bibliography. The complete content is preserved in the PDF.* diff --git a/documents/academic/phd_defense/thesis_final.pdf b/documents/academic/phd_defense/thesis_final.pdf new file mode 100644 index 0000000..0a312aa Binary files /dev/null and b/documents/academic/phd_defense/thesis_final.pdf differ diff --git a/documents/academic/phd_defense/thesis_main.md b/documents/academic/phd_defense/thesis_main.md new file mode 100644 index 0000000..776ba6e --- /dev/null +++ b/documents/academic/phd_defense/thesis_main.md @@ -0,0 +1,110 @@ +--- +type: thesis +category: academic +person: Yanxin Lu +date: 2019 +source: thesis_main.pdf +--- + +# RICE UNIVERSITY + +## Corpus-Driven Systems for Program Synthesis and Refactoring + +by + +### Yanxin Lu + +A Thesis Submitted +in Partial Fulfillment of the +Requirements for the Degree + +### Doctor of Philosophy + +--- + +Approved, Thesis Committee: + +**Swarat Chaudhuri, Chair** +Associate Professor of Computer Science + +**Christopher Jermaine** +Professor of Computer Science + +**Ankit B. Patel** +Assistant Professor of Electrical and Computer Engineering + +Houston, Texas + +April, 2019 + +--- + +## Abstract + +### Corpus-Driven Systems for Program Synthesis and Refactoring + +by + +Yanxin Lu + +Software development is a difficult task. Programmers need to work with many small components in large software projects which typically contain more than thousands of lines of code. To make software development manageable, developers and researchers have deployed various programming systems and tools. 
These include the ones that can facilitate refactoring existing source code and even generate programs automatically. One problem with traditional program synthesis tools is that they cannot generate practical results when given large specifications due to its high complexity of the underlying problem. Furthermore, existing refactoring systems can only refactor individual components separately and fail to instantiate complete programs. To overcome these problems, we can learn useful patterns and idioms from large code corpora using machine learning techniques. Researchers have used "big code" and developed novel and practical programming tools such as Bayou [1] and JSNice [2]. In this thesis, we present two data-driven programming systems for software reuse and refactoring. + +We first introduce *program splicing*, a programming methodology that aims to automate the workflow of copying, pasting, and modifying code available online. Here, the programmer starts by writing a "draft" that mixes unfinished code, natural language comments, and correctness requirements. A program synthesizer that interacts with a large, searchable database of program snippets is used to automatically complete the draft into a program that meets the requirements. Our evaluation uses the system in a suite of everyday programming tasks and includes a comparison with a state-of-the-art competing approach as well as a user study. The results point to the broad scope and scalability of program splicing and indicate that the approach can significantly boost programmer productivity. + +Next, we propose an algorithm that automates the process of *API refactoring*, where the goal is to rewrite an API call sequence into another sequence that only uses the API calls defined in the target library without modifying the functionality. We solve the problem of API refactoring by combining the techniques of API translation and API sequence synthesis. 
Specifically, we first translate original API calls into a set of new API calls defined in the target library. Then we use an API synthesizer to generate a complete program that uses the translated API calls. We evaluated our algorithm on a diverse set of benchmark problems, and our algorithm can refactor API sequences with high accuracy. + +Although the evaluations of the techniques presented in this thesis are quite optimistic, we believe that there is room for improvement by using more sophisticated language model and advanced search algorithm for program splicing. To improve our API refactoring method, one can train statistical models by using existing API call sequence pairs. Besides these potential improvements, many problems related to "big code" still remain, and the potential of using a data-driven method to help programming is enormous. + +--- + +## Contents + +- Abstract ... ii +- List of Illustrations ... vi +- List of Tables ... viii + +### 1 Introduction ... 1 +- 1.1 Program reuse via splicing ... 6 +- 1.2 API refactoring using natural language and API synthesizer ... 8 +- 1.3 Summary ... 11 + +### 2 Program Splicing ... 12 +- 2.1 Introduction ... 12 +- 2.2 Motivating Examples ... 14 + - 2.2.1 Reading a Matrix from a CSV File ... 14 + - 2.2.2 Face Detection using OpenCV ... 19 +- 2.3 Problem formulation ... 21 +- 2.4 Method ... 24 + - 2.4.1 Searching for programs ... 24 + - 2.4.2 Program completion ... 26 +- 2.5 Evaluation ... 31 + - 2.5.1 Benchmarks ... 32 + - 2.5.2 Experiments ... 35 +- 2.6 Summary ... 45 + +### 3 API Refactoring ... 46 +- 3.1 Introduction ... 46 +- 3.2 Motivating Examples ... 49 +- 3.3 Problem Definition ... 54 +- 3.4 Method ... 55 + - 3.4.1 API Translation ... 56 + - 3.4.2 API Call Sequence Synthesis ... 58 +- 3.5 Evaluation ... 62 + - 3.5.1 Benchmarks ... 62 + - 3.5.2 Experiments ... 63 + - 3.5.3 Limitations ... 69 +- 3.6 Summary ... 69 + +### 4 Related Work ... 71 +- 4.1 Program Synthesis and Reuse ... 
71 +- 4.2 Data-driven Program Synthesis ... 74 +- 4.3 Code Search ... 77 +- 4.4 API Refactoring and Translation ... 81 + +### 5 Conclusion and Future Work ... 85 + +### Bibliography ... 89 + +--- + +*Note: This is the unsigned/pre-defense version of the thesis (963KB). The full thesis contains 95+ pages of technical content including figures, tables, algorithms, code examples, experimental results, and bibliography. The complete content is preserved in the PDF.* diff --git a/documents/academic/phd_defense/thesis_main.pdf b/documents/academic/phd_defense/thesis_main.pdf new file mode 100644 index 0000000..0b85286 Binary files /dev/null and b/documents/academic/phd_defense/thesis_main.pdf differ diff --git a/documents/academic/phd_defense/thesis_submittal_confirmation.md b/documents/academic/phd_defense/thesis_submittal_confirmation.md new file mode 100644 index 0000000..807f3a5 --- /dev/null +++ b/documents/academic/phd_defense/thesis_submittal_confirmation.md @@ -0,0 +1,53 @@ +--- +type: form +category: academic +person: Yanxin Lu +date: 2018 +source: thesis_submittal_confirmation.pdf +--- + +# Thesis and Dissertation Submission + +**URL:** https://thesis.rice.edu/submit/2787/complete + +**Date:** 11/19/18, 15:04 + +**User:** Yanxin Lu + +--- + +Home / Submissions / Complete + +# Submittal Complete + +Thank you for uploading your thesis via Rice University's Thesis and Dissertation Management System. 
+ +If you have not already done so, you may log back into this site at any time to submit your final, corrected, signed thesis as well as your administrative documents: + +- Scan of your signed Approval of Candidacy form +- Survey of Earned Doctorates Completion Certificate (doctoral degrees only) + +After you have done so, please bring the following documents in hard copy to Allen Center 323 between 1:30 - 3:30 Monday-Friday: + +- Original Approval of Candidacy form +- Two signed, original copies of your thesis' title page + +You are welcome to ask someone to deliver these documents on your behalf. If you are unable to submit your forms during these hours, please email graduate@rice.edu to request an appointment. Hours will extend during the two weeks prior to each semester's deadline for thesis submission. + +Please be advised that your file is not complete and cannot be reviewed until we have your uploaded thesis, your administrative files, and the signed original documents. + +After reading the instructions on our website, http://graduate.rice.edu/submitthesis, please contact your department coordinator or graduate@rice.edu if you have any questions. + +If you need a document confirming your submission for employment purposes, please email graduate@rice.edu with your request. 
+ +Best, + +The Office of Graduate and Postdoctoral Studies and the +Center for Digital Scholarship +Rice University + +View submission status + +--- + +*Instructions | FAQ | Managed by Rice GPS and Fondren Library | Contact Us* diff --git a/documents/academic/phd_defense/thesis_submittal_confirmation.pdf b/documents/academic/phd_defense/thesis_submittal_confirmation.pdf new file mode 100644 index 0000000..c3e917f Binary files /dev/null and b/documents/academic/phd_defense/thesis_submittal_confirmation.pdf differ diff --git a/documents/academic/rice_engi601/lu_90_second_talk.md b/documents/academic/rice_engi601/lu_90_second_talk.md new file mode 100644 index 0000000..58ff955 --- /dev/null +++ b/documents/academic/rice_engi601/lu_90_second_talk.md @@ -0,0 +1,15 @@ +--- +type: presentation +category: academic +person: Yanxin Lu +date: 2018 +source: lu_90-second-talk.pdf +--- + +# Script for 90-Second Talk + +**ENGI 601** +**Yanxin Lu** +**Sunday, September 9, 2018** + +To produce a good piece of article, rewriting is important. To have a high-quality and long-lasting computer program, we need refactoring to keep programs clean and updated, because obsolete programs are typically unusable. However, program refactoring is difficult, because hundreds of components are intertwined with each other and moving one piece can easily break the entire program. To make refactoring less error-prone, I developed a method that automates the process of software refactoring and specifically I focus on dealing with refactoring Application Programming Interface call sequences, or API call sequences. First, I translate the input API calls into a set of individual API calls defined in the domain specified by the user using a statistical model called word2vec model, and then I put the translated API calls together and generate multiple complete candidate API call sequences using an API call sequence synthesizer. 
I applied my method on various refactoring tasks and most of the time I was able to achieve high accuracy. Given this result, I believe that my method can save most programmers' time, especially for those who needs to refactor many programs. diff --git a/documents/academic/rice_engi601/lu_90_second_talk.pages b/documents/academic/rice_engi601/lu_90_second_talk.pages new file mode 100644 index 0000000..218d02d Binary files /dev/null and b/documents/academic/rice_engi601/lu_90_second_talk.pages differ diff --git a/documents/academic/rice_engi601/lu_90_second_talk.pdf b/documents/academic/rice_engi601/lu_90_second_talk.pdf new file mode 100644 index 0000000..35ff958 Binary files /dev/null and b/documents/academic/rice_engi601/lu_90_second_talk.pdf differ diff --git a/documents/academic/rice_engi601/lu_90_second_talk_pages.md b/documents/academic/rice_engi601/lu_90_second_talk_pages.md new file mode 100644 index 0000000..b9b5f59 --- /dev/null +++ b/documents/academic/rice_engi601/lu_90_second_talk_pages.md @@ -0,0 +1,11 @@ +--- +type: presentation +category: academic +person: Yanxin Lu +date: 2018 +source: 90-second-talk.pages +--- + +# 90-Second Talk Script + +This file is in Apple Pages format (.pages) and cannot be rendered as text. See the companion PDF `lu_90_second_talk.pdf` for the readable version of this document. 
diff --git a/documents/academic/rice_engi601/lu_api_refactoring.key b/documents/academic/rice_engi601/lu_api_refactoring.key new file mode 100644 index 0000000..11750d0 Binary files /dev/null and b/documents/academic/rice_engi601/lu_api_refactoring.key differ diff --git a/documents/academic/rice_engi601/lu_api_refactoring.md b/documents/academic/rice_engi601/lu_api_refactoring.md new file mode 100644 index 0000000..54a6803 --- /dev/null +++ b/documents/academic/rice_engi601/lu_api_refactoring.md @@ -0,0 +1,11 @@ +--- +type: presentation +category: academic +person: Yanxin Lu +date: 2018 +source: api-refactoring.key +--- + +# API Refactoring Presentation + +This file is in Apple Keynote format (.key) and cannot be rendered as text. See the companion PDF `lu_slides.pdf` for the readable version of this presentation. diff --git a/documents/academic/rice_engi601/lu_engi601_poster.key b/documents/academic/rice_engi601/lu_engi601_poster.key new file mode 100644 index 0000000..41f0a05 Binary files /dev/null and b/documents/academic/rice_engi601/lu_engi601_poster.key differ diff --git a/documents/academic/rice_engi601/lu_engi601_poster.pptx b/documents/academic/rice_engi601/lu_engi601_poster.pptx new file mode 100644 index 0000000..7d7bec8 Binary files /dev/null and b/documents/academic/rice_engi601/lu_engi601_poster.pptx differ diff --git a/documents/academic/rice_engi601/lu_engi601_poster_key.md b/documents/academic/rice_engi601/lu_engi601_poster_key.md new file mode 100644 index 0000000..f7ddd72 --- /dev/null +++ b/documents/academic/rice_engi601/lu_engi601_poster_key.md @@ -0,0 +1,11 @@ +--- +type: poster +category: academic +person: Yanxin Lu +date: 2018 +source: engi601_poster.key +--- + +# ENGI 601 Poster (Keynote) + +This file is in Apple Keynote format (.key) and cannot be rendered as text. See the companion PDF `lu_poster.pdf` for the readable version of this poster. 
diff --git a/documents/academic/rice_engi601/lu_engi601_poster_pptx.md b/documents/academic/rice_engi601/lu_engi601_poster_pptx.md new file mode 100644 index 0000000..ab3ac4f --- /dev/null +++ b/documents/academic/rice_engi601/lu_engi601_poster_pptx.md @@ -0,0 +1,11 @@ +--- +type: poster +category: academic +person: Yanxin Lu +date: 2018 +source: engi601_poster.pptx +--- + +# ENGI 601 Poster (PowerPoint) + +This file is in Microsoft PowerPoint format (.pptx) and cannot be rendered as text. See the companion PDF `lu_poster.pdf` for the readable version of this poster. diff --git a/documents/academic/rice_engi601/lu_poster.md b/documents/academic/rice_engi601/lu_poster.md new file mode 100644 index 0000000..81fa630 --- /dev/null +++ b/documents/academic/rice_engi601/lu_poster.md @@ -0,0 +1,103 @@ +--- +type: poster +category: academic +person: Yanxin Lu +date: 2018 +source: lu_poster.pdf +--- + +# Corpus-Driven API Refactoring + +**Yanxin Lu, Swarat Chaudhuri, Christopher Jermaine** +*Department of Computer Science, Rice University* + +## Introduction + +- Program rewrite or refactoring improves software maintainability. +- Application programming interface (API) plays key role in everyday programming. +- Automatically refactor an API call sequence +- Translate the input API calls +- Synthesize complete API call sequence + +### Code Example (Before - HtmlCleaner) + +```java +HtmlCleaner cleaner = new HtmlCleaner(); +TagNode node = cleaner.clean(content); +TagNode[] links = node.getElementsHavingAttribu... 
+TagNode link = links[0]; +String href = link.getAttributeByName(attr); +``` + +### Code Example (After - Jsoup) + +```java +Document doc = Jsoup.parse(content); +Elements links = doc.select(selector); +Element link = links.first(); +String href = link.attr(attr); +``` + +## Methods + +- Translate the input API calls +- Synthesize complete API call sequence + +### Algorithm Diagram + +- A() --> a() --> a() +- B() --> b() --> b() +- C() (API translation) --> c() (API synthesis) --> c() +- D() --> d() --> d() +- E() --> e() --> e() + +## Main Results + +- Refactoring accuracy on various input API call sequences +- Accuracy: percentage of correct generated API calls + +### Accuracy Chart + +Bar chart showing "Accuracy w/o params" and "Accuracy" for the following benchmark tasks: + +CSV read, CSV write, CSV database, CSV delimiter, email login, email check, email send, email delete, FTP list, FTP login, FTP upload, FTP download, FTP delete, HTML scraping, HTML add node, HTML rm attr, HTML parse, HTML title, HTML write, HTTP get, HTTP post, HTTP server, NLP sentence, NLP token, NLP tag, NLP stem, ML classification, ML regression, ML cluster, ML neural network, graphics, gui, pdf read, pdf write, word read, word write + +## Limitations + +Our refactoring method might not work as expected: + +- Inaccurate API translation + - HTML Writing + - Word Reading/Writing + - GUI +- Long input API sequence + - Sending Email + - PDF Writing + +### Limitation Diagram + +- A() --> x() +- B() --> y() +- C() (translation) --> z() +- D() +- E() +- F() +- G() + +## Conclusion + +- Effective method that automates the process of API refactoring +- Combination of two techniques + - API call translation + - API call sequence synthesizer +- Does not work when + - Terminologies are different + - Input sequence is too long + +## Bibliography + +- Amruta Gokhale, Vinod Ganapathy, and Yogesh Padmanaban. Inferring likely mappings between apis. 
In Proceedings of the 2013 International Conference on Software Engineering, pages 82-91. IEEE Press, 2013. +- Amruta Gokhale, Daeyoung Kim, and Vinod Ganapathy. Data-driven inference of api mappings. In Proceedings of the 2nd Workshop on Programming for Mobile & Touch, pages 29-32. ACM, 2014. +- Vijayaraghavan Murali, Letao Qi, Swarat Chaudhuri, and Chris Jermaine. Neural sketch learning for conditional program generation. arXiv preprint arXiv:1703.05698, 2017. +- Rahul Pandita, Raoul Praful Jetley, Sithu D Sudarsan, and Laurie Williams. Discovering likely mappings between apis using text mining. In Source Code Analysis and Manipulation (SCAM), 2015 IEEE 15th International Working Conference on, pages 231-240. IEEE, 2015. +- Trong Duc Nguyen, Anh Tuan Nguyen, and Tien N Nguyen. Mapping api elements for code migration with vector representations. In Software Engineering Companion (ICSE-C), IEEE/ACM International Conference on, pages 756-758. IEEE, 2016. diff --git a/documents/academic/rice_engi601/lu_poster.pdf b/documents/academic/rice_engi601/lu_poster.pdf new file mode 100644 index 0000000..3f35152 Binary files /dev/null and b/documents/academic/rice_engi601/lu_poster.pdf differ diff --git a/documents/academic/rice_engi601/lu_slides.md b/documents/academic/rice_engi601/lu_slides.md new file mode 100644 index 0000000..3d28ce7 --- /dev/null +++ b/documents/academic/rice_engi601/lu_slides.md @@ -0,0 +1,313 @@ +--- +type: presentation +category: academic +person: Yanxin Lu +date: 2018 +source: lu_slides.pdf +--- + +# API Refactoring Using Natural Language and Program Synthesis + +**Yanxin Lu, Rice University** +**Swarat Chaudhuri, Rice University** +**Christopher Jermaine, Rice University** + +--- + +## Slide 1: Title Slide + +(blank/title page) + +--- + +## Slide 2: Title + +API refactoring using natural language and program synthesis + +Yanxin Lu, Rice University +Swarat Chaudhuri, Rice University +Christopher Jermaine, Rice University + +--- + +## Slide 3: 
Software Refactoring + +- Library/platform upgrade +- Obsolete code reuse + +### Example (Before - SSHJ): + +```java +SSHClient ssh = new SSHClient(); +SFTPClient ftp = ssh.newSFTPClient(); +ssh.authPassword(username, password); +ssh.connect(host); +ftp.ls(path); +ftp.close(); +``` + +(Arrow: API refactoring) + +### Example (After - Apache): + +```java +FTPClient f = new FTPClient(); +f.connect(host); +f.login(username, password); +FTPFile[] files = f.listFiles(path); +f.disconnect(); +``` + +- Almost as hard as coding + +--- + +## Slide 4: Problem + +Can we automate the process of API refactoring using program synthesis? + +--- + +## Slide 5: Contribution + +- Combination of two existing techniques + - API translation + - Natural language + - API sequence synthesis + - Complete API sequence + - Bayou + +--- + +## Slide 6: Related Work + +- API mapping + - Natural language + - Sequence alignment +- API sequence synthesis + - Learning from the web (SWIM) + - Bayou + +References: +- Raghothaman, Mukund, Yi Wei, and Youssef Hamadi. "SWIM: Synthesizing What I Mean-Code Search and Idiomatic Snippet Synthesis." *Software Engineering (ICSE), 2016 IEEE/ACM 38th International Conference on.* IEEE, 2016. +- Murali, Vijayaraghavan, et al. "Neural Sketch Learning for Conditional Program Generation." *arXiv preprint arXiv:1703.05698* (2017). + +--- + +## Slide 7: Algorithm + +- A() --> (API translation) --> a() --> (API synthesis) --> a() +- B() --> b() --> b() +- C() --> c() --> c() +- D() --> d() --> d() +- E() --> e() --> e() + +--- + +## Slide 8: Algorithm (Highlighted: API translation) + +Same diagram as Slide 7, with the "API translation" step highlighted in a red box. + +--- + +## Slide 9: API Translation + +### Architecture: + +1. All relevant libraries and Java 8 --> Text extraction --> Javadoc cards (e.g., "clean / Parse an html document string", "isValid / Test if the input body HTML has only tags and attributes ...") +2. Train a word2vec model +3. 
Input API calls: A(), B(), C(), D(), E() --> Translator --> Output: a(), b(), c(), d(), e() + +--- + +## Slide 10: Word2Vec Model + +- Captures some degree of semantic information + +| Query word | Similar words | +|---|---| +| int | integer, float, long, double, short | +| ftp | nntp, smtp, secret, pixmap, out-of-synch | +| button | rollover, radio, tooltip, checkbox, click | +| index | IndexFrom, MenuIndex, ListIndex, occurrence, nth | +| stream | InputStream, StreamB, BufferTest, console, AccessFile | +| image | gif, animation, texture, BufferedImage, RenderedImage | +| email | bcc, recipient, sender, addresse, mail | +| vector | scalar, dense, product, kernel, matrix | + +--- + +## Slide 11: Pair-wise API Similarities + +Bipartite graph between Apache and SSHJ APIs: + +Apache side: connect, login, list, close +SSHJ side: auth, connect, ls, disconnect + +Lines connecting each Apache API to each SSHJ API (showing pair-wise similarity scores), with thicker lines indicating stronger matches (e.g., connect-connect, login-auth, list-ls, close-disconnect). + +--- + +## Slide 12: API Similarity + +Comparing two APIs: + +Left API: +- Name: clean +- Return: TagNode +- Description: "Parse an html document string" + +Right API: +- Name: parse +- Return: Document +- Description: "Parse HTML into a Document." + +Similarity scores between components: +- Name similarity: 0.7 +- Return type similarity: 0.3 +- Description similarity: 0.8 + +- Similarity = w1 * 0.5 + w2 * 0.3 + w3 * 0.8 +- More weight on description similarity +- Word list similarity - bipartite matching + +--- + +## Slide 13: Algorithm (Highlighted: API synthesis) + +Same diagram as Slide 7, with the "API synthesis" step highlighted in a red box. + +--- + +## Slide 14: API Sequence Synthesis + +- Input: APIs in the target library + - Stand-alone APIs + - Might miss a few APIs +- Output: a complete API call sequence +- Bayou + +Reference: Murali, Vijayaraghavan, et al. 
"Neural Sketch Learning for Conditional Program Generation." *arXiv preprint arXiv:1703.05698* (2017). + +--- + +## Slide 15: API Sequence Synthesis (with Bayou example) + +- Input: APIs in the target library + - Stand-alone APIs + - Might miss a few APIs +- Output: a complete API call sequence +- Bayou + +### Bayou Input (evidence): + +``` +void read_href(String content, + String selector, + String attr, + Evaluator _arg01) { + ///call:parse type:Jsoup call:select + ///call:first +} +``` + +### Bayou Output: + +```java +void read_href(String content, + String selector, + String attr, + Evaluator _arg01) { + Elements e1; + Document d1; + Elements e3; + Element e2; + d1 = Jsoup.parse(content); + e1 = d1.select(selector); + e2 = e1.first(); + e3 = Collector.collect(_arg01, e2); +} +``` + +Evidence: types, calls + +--- + +## Slide 16: Bayou + +### Architecture: + +1. Code corpus --> Evidence extraction --> Evidence/code pairs (e.g., "call:readLine type:FileReader type:BufferedReader", "type:Iterator call:next call:remove") + +Example extracted code: + +```java +void remove(List list) { + Iterator i1; + boolean b1; + i1 = list.iterator(); + while ((b1 = i1.hasNext())) { + i1.remove(); + } + return; +} +``` + +2. Training (neural network with distribution curve) + +3. 
Input evidence: `call:parse type:Jsoup call:select call:first` --> Trained model --> Output: + +```java +void read_href(String content, + String selector, + String attr, + Evaluator _arg01) { + Elements e1; + Document d1; + Elements e3; + Element e2; + d1 = Jsoup.parse(content); + e1 = d1.select(selector); + e2 = e1.first(); + e3 = Collector.collect(_arg01, e2); +} +``` + +--- + +## Slide 17: Evaluation + +- Accuracy - percentage of correctly generated API calls +- 75% accuracy on most benchmark problems + +Bar chart showing "Accuracy w/o params" and "Accuracy" for benchmark tasks: + +CSV read, CSV write, CSV database, CSV delimiter, email login, email check, email send, email delete, FTP list, FTP login, FTP upload, FTP download, FTP delete, HTML scraping, HTML add node, HTML rm attr, HTML parse, HTML title, HTML write, HTTP get, HTTP post, HTTP server, NLP sentence, NLP token, NLP tag, NLP stem, ML classification, ML regression, ML cluster, ML neural network, graphics, gui, pdf read, pdf write, word read, word write + +--- + +## Slide 18: Translation Failure + +Top chart: Same accuracy bar chart as Slide 17, with red boxes highlighting problem areas: HTML title, HTML write, gui, pdf read, pdf write, word read, word write. + +Bottom chart: "Translation" accuracy at different levels (Translation-1, Translation-3, Translation-5) for the same benchmark tasks, showing that translation accuracy is the bottleneck for the highlighted tasks. + +--- + +## Slide 19: Rare Sequence + +Top chart: Same accuracy bar chart, with red boxes highlighting: email send, HTML scraping, gui, pdf read, pdf write, word read, word write. + +Bottom chart: "Min Bayou calls" for each benchmark task, showing that tasks with rare sequences (highlighted in red) have fewer matching Bayou training examples, leading to lower accuracy. 
+ +--- + +## Slide 20: API Refactoring (Conclusion) + +- Effective method that automates the process of API refactoring +- Combination of two techniques + - API call translation + - API call sequence synthesizer +- Does not work when + - Terminologies are different + - Rare sequence diff --git a/documents/academic/rice_engi601/lu_slides.pdf b/documents/academic/rice_engi601/lu_slides.pdf new file mode 100644 index 0000000..ed3bc58 Binary files /dev/null and b/documents/academic/rice_engi601/lu_slides.pdf differ diff --git a/documents/academic/rice_engi601/lu_writing.pdf b/documents/academic/rice_engi601/lu_writing.pdf new file mode 100644 index 0000000..166182d Binary files /dev/null and b/documents/academic/rice_engi601/lu_writing.pdf differ diff --git a/documents/academic/rice_engi601/lu_writing.tex b/documents/academic/rice_engi601/lu_writing.tex new file mode 100644 index 0000000..1d0f42a --- /dev/null +++ b/documents/academic/rice_engi601/lu_writing.tex @@ -0,0 +1,287 @@ +\chapter{Introduction} +\label{ch:intro} +With the advancement in technologies such as artificial intelligence +and also the expansion of high-tech companies, computer programming +starts to become an important skill, and the demand for programmers +has been growing dramatically in the past few years. The overall +productivity has been boosted significantly thanks to the increasing +number of programmers, but we still have not witnessed any boost in +individual programming productivity. + +The most important reason is that programming is a difficult task. It +requires programmers to deal with extremely low-level details in +complex software projects, and it is almost inevitable for programmers +to make small mistakes. People tend to assume that a piece of untested +software does not function properly. To deal with this problem, +software engineering techniques and formal method based techniques +have been proposed to help facilitate programming.
These techniques +include various software engineering methodologies, design patterns, +sophisticated testing methods, program repair algorithms, model +checking algorithms and program synthesis methods. Some techniques +such as software engineering methodologies, design patterns and unit +testing have been practical and useful in boosting programming +productivity and the industry has been adopting these techniques for +more than a decade. The main reason for its popularity and longevity +is that these techniques are quite easy to execute for average +programmers. However, one dominant problem with these software +engineering approaches is that they are not rigorous enough. If the +specification of a method is not followed strictly, its benefits will +tend to be hindered. Advance methods with more rules have been +proposed, but the specification tend to be vague sometimes, which +results in execution difficulties. + +Some researchers switched their attention to applying formal methods +to tackle the difficulties in programming. Methods such as model +checking and program synthesis are much more rigorous than traditional +software engineering techniques, and its performance and benefit is +guaranteed once everything works accordingly. However, the impact of +these formal methods technique is much less compared to the influence +brought by the software engineering techniques, and the reason is that +it is very likely that a formal method based approach will not work +when large input is provided, because it will not terminate and +produce any useful result due to its large search space. These large +search spaces are inevitable, since formal methods techniques +typically deal with extremely complex problems in theory. However, +people have been trying to make formal method approaches practical by +introducing additional hints~\cite{Srivastava2012} or by restricting +the problem domain~\cite{Gulwani2011spreadsheet, Gulwani2011, + Gulwani2010}. 
+ +With the advent of ``big data'', researchers started to pay attention +to the problems that were considered difficult or impossible, and this +has led to a significant advancement in the area of machine +learning. Similarly, as more and more open source repositories such as +\verb|Google Code|, \verb|Github| and \verb|SourceForge| have come +online where thousands of software projects and their source code +become available, researchers from the programming language community +also started to consider using ``big code'' to tackle the problems +that were considered difficult. With the help of ``big code'', many +new techniques that use formal methods and aim to facilitate +programming have been proposed. These techniques include program +property prediction~\cite{mishne12, Raychev2015}, API sequence +prediction~\cite{Raychev2014, murali2017neural, murali2017bayesian} +and small program generation~\cite{balog2016deepcoder}. Researchers +have shown that using data can indeed make the problem of synthesis +feasible~\cite{balog2016deepcoder} and practical tools that can help +human developers have also started to appear and programmers have +started to use those in practice~\cite{Raychev2015, murali2017neural}. + +Two major types of algorithms were used in the current literature of +applying formal methods to software engineering. The first type of +algorithms is based on combinatorial search. Combinatorial search +plays an important role in model checking and traditional program +synthesis problems~\cite{Manna1992, rajeev2013, lezama06, Long2015, + Douskos2015, Pnueli1989, Alur2015, Feser2015, Gulwani2010}. The main +idea is to first define a goal and also the steps for reaching the +goal. Programmers can then let the computer search for a +solution. Typically heuristics are defined to reduce the search space +and to speed up the search time.
The advantages of search-based +methods include (1) it is relatively easy to implement and it can be +used to solve problems where no efficient solutions exist, (2) +sometimes the algorithms can discover results that are hard to think +about as humans because computers can easily discover solutions in a +large search space quickly compared to humans, and (3) search-based +methods can solve problems that require precision and precision is +typically required for analyzing computer programs. As SAT solvers and +SMT solvers became sophisticated, people have been able to use those +fast solvers to gain a significant performance boost. The biggest +drawback of search-based methods is its high algorithmic +complexity. The search space grows indefinitely as the input size +gradually increases and this is the main reason why most traditional +model checking methods and program synthesis algorithms cannot deal +with large programs~\cite{Gulwani2010}. Another drawback that is worth +mentioning is that search-based methods tend to be quite +fragile. Those methods typically require inputs at every step to be +extremely precise, or the algorithm would not perform as expected. + +The second type of algorithms is based on learning. The idea of +learning is to let the machine improve its performance using data in +solving a task and during the process learning-based methods are able +to capture idioms that are essential in solving the problem. These +idioms are typically hard to express or discover for humans. The large +amount of data was not available online until around 2012 and after +that researchers started applying learning-based methods to +programming systems~\cite{mishne12, Raychev2015, Raychev2014, + murali2017neural, murali2017bayesian, balog2016deepcoder}. The +biggest advantage brought by ``big data'' or ``big code'' is that it +allows researchers to find idioms that reduce the search space +significantly by using machine learning techniques.
Examples include +relationships between variable names and their semantics information +and API call sequence idioms. These idioms cannot be made available +without people analyzing a large amount of data. Another advantage +compared to search-based method is its robustness and this is because +machine learning algorithms tend to use a large amount of data where +small noises are suppressed. Even though data-driven programming +systems are quite impactful, learning-based methods are not as +accessible as search-based methods because learning-based methods tend +to require data. In order to make learning-based algorithms perform +well in practice, a large amount of data is typically required and +this also leads to a large consumption on time and computation +resources which might not be available for everyone. + +In this thesis, we propose two additional corpus-driven systems that +aim to automate the process of software reuse and software +refactoring. In the current literature, the problem of software reuse +and refactoring have been both considered, but no systems can fully +automate software reuse and refactoring and some state-of-the-art +tools~\cite{Barr2015, balaban2005refactoring} still requires human to +provide additional hints. By using a large code corpus, we claim that +our systems can fully automate the process of software reuse and +refactoring without human intervention, and our systems can accomplish +the tasks efficiently and help human developers by boosting their +program productivity. + +\section{Program reuse via splicing} +We first introduce {\em program splicing}, a programming system that +helps human developers by automating the process of software +reuse. The most popular workflow nowadays consists of copying, pasting, +and modifying code available online and the reason for its domination +is that it is relatively easy to execute with the help of internet +search. However, this process inherits the drawbacks from +programming. 
This process requires extreme precision and carefulness +from programmers similar to normal programming. When a software reuse +task happens in a large and complicated software system, the cost of +making mistakes and spending enormous time on repairing might exceed +the benefit, let alone the fact that programmers sometimes do not even +try to fully understand the code they bring in from the internet so +long as it appears to work under their specific software +environment. This might impose a threat to their future software +development progress. + +Existing techniques that inspire the idea of our method can be divided +into two areas, search-based program synthesis techniques and +data-driven methods. The problem of program synthesis has been studied +for decades and researchers have been applying search-based methods to +tackle the problem for several decades~\cite{Pnueli1989, lezama06, + Srivastava2012, Alur2015, Feser2015, yaghmazadeh2016}. The main +benefit with respect to this work comes from the fact that +search-based method can produce results that require precision. This +is quite crucial when we aim to generate code snippets that needs to +interact with pre-written software pieces and examples might include +matching variables that are semantically similar or +equivalent. However, the problem with search-based method is that it +does not scale well into handling large inputs, which lead to large +search spaces, due to the complexity of the problem, and this is the +main reason why one of the competing system, $\mu$Scalpel, is not as +efficient as our splicing method. To alleviate the scalability +problem, people have proved that using ``big data'' can be quite +effective~\cite{Raychev2015, Raychev2014, raychev2016, + balog2016deepcoder, hindle2012naturalness}. 
Even though our splicing +method does not use any statistical method, we still reduce our search +space significantly and achieve high efficiency by relying on using natural +language to search a big code corpus~\cite{kashyap17}. + +One of our novelty in this work is that we combine the ideas from +search-based methods and data-driven methods. To use our programming +system for program reuse, a programmer starts by writing a ``draft'' +that mixes unfinished code, natural language comments, and correctness +requirements. A program synthesizer that interacts with a large, +searchable database of program snippets is used to automatically +complete the draft into a program that meets the requirements. The +synthesis process happens in two stages. First, the synthesizer +identifies a small number of programs in the +database~\cite{zou2018plinycompute} that are relevant to the synthesis +task. Next it uses an enumerative search to systematically fill the +draft with expressions and statements from these relevant +programs. The resulting program is returned to the programmer, who can +modify it and possibly invoke additional rounds of synthesis. + +We present an implementation of program splicing, called \system, for +the Java programming language. \system uses a corpus of over 3.5 +million procedures from an open-source software repository. Our +evaluation uses the system in a suite of everyday programming tasks, +and includes a comparison with a state-of-the-art competing +approach~\cite{Barr2015} as well as a user study. The results point to +the broad scope and scalability of program splicing and indicate that +the approach can significantly boost programmer productivity. 
+ +\section{API refactoring using natural language and API synthesizer} +Software refactoring typically involves reconstructing existing source +code without modifying the functionality, and it is important and +almost a daily routine that programmers perform to keep their +software projects clean and organized by constructing better +abstractions, deleting duplicate code, breaking down big +functionalities into small pieces that are universally applicable, +etc. Software system maintenance is extremely crucial, because a +software system can easily deteriorate and become obsolete and useless +if maintenance is not done properly and regularly, especially when the +external libraries it uses and the other underlying software systems +it depends on evolve rapidly nowadays. After several decades of +software development, most professional programmers have realized the +importance of software refactoring, and software refactoring has been +used heavily and regularly in the software industry. Similar to +software reuse, software refactoring also inherits the drawbacks from +programming. It again requires extreme accuracy from programmers, and +programmers tend to make mistakes when they deal with large and +complex software systems which typically involve keeping track of +tens or even hundreds of variables and function components. + +In this thesis, we focus on refactoring Application Programming +Interface (API) call sequences. An API consists of all the definitions +and usages of the resources available for external use from a software +system, and almost all software systems are built using various APIs +from other software systems nowadays. The process of API refactoring +mainly consists of changing the API call sequence defined in one +library into another sequence defined in another library. The benefit +of performing API refactoring is identical to general software +refactoring, but API refactoring has its specific benefits.
The first +specific benefit allows programmers to reuse obsolete programs in which +programmers can adapt an obsolete program into the existing +programming environment. Another benefit is that it can enhance the +performance of existing programs by refactoring the existing program +into another program that uses advanced libraries and platforms which +typically have better performance. + +The main difficulty of API refactoring comes from discovering +semantically equivalent API calls between two libraries and how to +instantiate the new API calls using the environment's variables so +that the resulting API call sequence does not alter the functionality +of the original API call sequence. One of the earliest +works~\cite{balaban2005refactoring} that aims to help API refactoring +requires human intervention. The user of the system needs to formally +specify the mapping between the API calls in two libraries, and the +system only focuses on refactoring \emph{individual} API calls instead +of refactoring sequences. Subsequent research in the area of API +refactoring has been limited to the problem of API mapping or API +translation. The goal is to discover two API calls that are +semantically equivalent. Two types of methods were developed to solve +the problem of API translation. The first one involves aligning two +API call sequences using a statistical model and the translations can +be extracted from the alignment +results~\cite{gokhale2013inferring}. This alignment method allows +people to find not only one-to-one API translations but also +one-to-many API translations, but the downside is that this method +requires a large number of API call sequences to train the underlying +statistical method. Another method relies on natural language features +such as Javadoc to find semantically equivalent API +calls~\cite{pandita2015discovering, nguyen2016mapping, + zhong2009inferring}. 
Since Javadoc contains descriptions of the +nature of API calls, correct translations can be found by calculating +the similarities between the Javadoc texts of two API calls, and +calculating similarities can easily be done using a standard +\verb|Word2Vec| model which is able to calculate semantic similarities +between words. The only drawback of using natural language features as +the main glue is that it is difficult to discover one-to-many API +translations. + +In this thesis, we propose a new algorithm that automates the process +of API refactoring by combining the natural language +technique~\cite{pandita2015discovering} and a state-of-the-art API +call sequence synthesizer called +\verb|Bayou|~\cite{murali2017neural}. The input to our algorithm +includes an API call sequence and the name of the destination library, +and our algorithm can produce another semantically equivalent sequence +that uses only the API calls defined in the destination library. We +solve the problem in two steps. We first translate the input API call +sequences into a set of stand-alone API calls defined in the +destination library using natural language features as the main +driver~\cite{pandita2015discovering, nguyen2016mapping}. Then we feed +the stand-alone API calls into an API sequence synthesizer called +\emph{Bayou}~\cite{murali2017neural} which in turn synthesizes a +complete sequence of API calls. We have designed a series of benchmark +problems to evaluate the accuracy of our API refactoring algorithm, +and here the accuracy is defined as the percentage of correctly +generated API calls. The results show that our algorithm is able to +refactor API call sequences accurately, given that the two involved +libraries have similar coding practices and the input sequence is not +rare in the training data. 
diff --git a/documents/academic/rice_engi601/lu_writing_pdf.md b/documents/academic/rice_engi601/lu_writing_pdf.md new file mode 100644 index 0000000..e4a6eee --- /dev/null +++ b/documents/academic/rice_engi601/lu_writing_pdf.md @@ -0,0 +1,245 @@ +--- +type: paper +category: academic +person: Yanxin Lu +date: 2018 +source: writing.pdf +--- + +# Corpus-Driven Programming Systems for Program Synthesis and Refactoring + +**RICE UNIVERSITY** + +by + +**Yanxin Lu** + +A Thesis Submitted in Partial Fulfillment of the Requirements for the Degree + +**Doctor of Philosophy** + +Approved, Thesis Committee: + +- Swarat Chaudhuri, Chair -- Associate Professor of Computer Science +- Christopher Jermaine -- Professor of Computer Science +- Ankit B. Patel -- Assistant Professor of Electrical and Computer Engineering + +Houston, Texas +August, 2018 + +--- + +## Contents + +- List of Illustrations ... iii +- List of Tables ... iv +- 1 Introduction ... 1 + - 1.1 Program reuse via splicing ... 5 + - 1.2 API refactoring using natural language and API synthesizer ... 7 +- Bibliography ... 10 + +--- + +## Illustrations + +(Empty page) + +--- + +## Tables + +(Empty page) + +--- + +## Chapter 1: Introduction + +With the advancement in technologies such as artificial intelligence and also the expansions of high-tech companies, computer programming starts to become an important skill, and the demand for programmers has been growing dramatically in the past few years. The overall productivity has been boosted significantly thanks to the increasing number of programmers, but we still have not witnessed any boost in individual programming productivity. + +The most important reason is that programming is a difficult task. It requires programmers to deal with extremely low-level details in complex software projects, and it is almost inevitable for programmers to make small mistakes. People tend to assume that a piece of untested software does not function properly. 
To deal with this problem, software engineering techniques and formal method based techniques have been proposed to help facilitate programming. These techniques include various software engineering methodologies, design patterns, sophisticated testing methods, program repair algorithms, model checking algorithms and program synthesis methods. Some techniques such as software engineering methodologies, design patterns and unit testing have been practical and useful in boosting programming productivity and the industry has been adopting these techniques for more than a decade. The main reason for their popularity and longevity is that these techniques are quite easy to execute for average programmers. However, one dominant problem with these software engineering approaches is that they are not rigorous enough. If the specification of a method is not followed strictly, its benefits will tend to be hindered. Advanced methods with more rules have been proposed, but the specifications tend to be vague sometimes, which results in execution difficulties. + +Some researchers switched their attention to applying formal methods to tackle the difficulties in programming. Methods such as model checking and program synthesis are much more rigorous than traditional software engineering techniques, and their performance and benefits are guaranteed once everything works accordingly. However, the impact of these formal methods techniques is much less compared to the influence brought by the software engineering techniques, and the reason is that it is very likely that a formal method based approach will not work when large input is provided, because it will not terminate and produce any useful result due to its large search space. These large search spaces are inevitable, since formal methods techniques typically deal with extremely complex problems in theory. 
However, people have been trying to make formal method approaches practical by introducing additional hints [1] or by restricting the problem domain [2, 3, 4]. + +With the advent of "big data", researchers started to pay attention to the problems that were considered difficult or impossible, and this has led to a significant advancement in the area of machine learning. Similarly, as more and more open source repositories such as `Google Code`, `Github` and `SourceForge` have come online where thousands of software projects and their source code became available, researchers from the programming language community also started to consider using "big code" to tackle the problems that were considered difficult. With the help of "big code", many new techniques that use formal methods and aim to facilitate programming have been proposed. These techniques include program property prediction [5, 6], API sequence prediction [7, 8, 9] and small program generation [10]. Researchers have shown that using data can indeed make the problem of synthesis feasible [10] and practical tools that can help human developers have also started to appear and programmers have started to use those in practice [6, 8]. + +Two major types of algorithms were used in the current literature on applying formal methods to software engineering. The first type of algorithms is based on combinatorial search. Combinatorial search plays an important role in model checking and traditional program synthesis problems [11, 12, 13, 14, 15, 16, 17, 18, 4]. The main idea is to first define a goal and also the steps for reaching the goal. Programmers can then let the computer search for a solution. Typically heuristics are defined to reduce the search space and to speed up the search time. 
The advantages of search-based methods include (1) it is relatively easy to implement and it can be used to solve problems where no efficient solutions exist, (2) sometimes the algorithms can discover results that are hard for humans to think of because computers can easily discover solutions in a large search space quickly compared to humans, and (3) search-based methods can solve problems that require precision and precision is typically required for analyzing computer programs. As SAT solvers and SMT solvers became sophisticated, people have been able to use those fast solvers to gain a significant performance boost. The biggest drawback of search-based methods is their high algorithmic complexity. The search space grows indefinitely as the input size gradually increases and this is the main reason why most traditional model checking methods and program synthesis algorithms cannot deal with large programs [4]. Another drawback that is worth mentioning is that search-based methods tend to be quite fragile. Those methods typically require inputs at every step to be extremely precise, or the algorithm would not perform as expected. + +The second type of algorithms is based on learning. The idea of learning is to let machines improve their performance using data in solving a task and during the process learning-based methods are able to capture idioms that are essential in solving the problem. These idioms are typically hard for humans to express or discover. Large amounts of data were not available online until around 2012 and after that researchers started applying learning-based methods to programming systems [5, 6, 7, 8, 9, 10]. The biggest advantage brought by "big data" or "big code" is that it allows researchers to find idioms that reduce the search space significantly by using machine learning techniques. Examples include relationships between variable names and their semantic information and API call sequence idioms. 
These idioms cannot be made available without people analyzing a large amount of data. Another advantage compared to search-based methods is their robustness and this is because machine learning algorithms tend to use a large amount of data where small noises are suppressed. Even though data-driven programming systems are quite impactful, learning-based methods are not as accessible as search-based methods because learning-based methods tend to require data. In order to make learning-based algorithms perform well in practice, a large amount of data is typically required and this also leads to a large consumption of time and computation resources which might not be available for everyone. + +In this thesis, we propose two additional corpus-driven systems that aim to automate the process of software reuse and software refactoring. In the current literature, the problems of software reuse and refactoring have both been considered, but no systems can fully automate software reuse and refactoring and some state-of-the-art tools [20, 21] still require humans to provide additional hints. By using a large code corpus, we claim that our systems can fully automate the process of software reuse and refactoring without human intervention, and our systems can accomplish the tasks efficiently and help human developers by boosting their programming productivity. + +### 1.1 Program reuse via splicing + +We first introduce *program splicing*, a programming system that helps human developers by automating the process of software reuse. The most popular workflow nowadays consists of copying, pasting, and modifying code available online and the reason for its domination is that it is relatively easy to execute with the help of internet search. However, this process inherits the drawbacks from programming. This process requires extreme precision and carefulness from programmers similar to normal programming. 
When a software reuse task happens in a large and complicated software system, the cost of making mistakes and spending enormous time on repairing might exceed the benefit, let alone the fact that programmers sometimes do not even try to fully understand the code they bring in from the internet so long as it appears to work under their specific software environment. This might pose a threat to their future software development progress. + +Existing techniques that inspire the idea of our method can be divided into two areas, search-based program synthesis techniques and data-driven methods. The problem of program synthesis has been studied for decades and researchers have been applying search-based methods to tackle the problem for several decades [16, 13, 1, 17, 18, 35]. The main benefit with respect to this work comes from the fact that search-based methods can produce results that require precision. This is quite crucial when we aim to generate code snippets that need to interact with pre-written software pieces and examples might include matching variables that are semantically similar or equivalent. However, the problem with search-based methods is that they do not scale well to large inputs, which lead to large search spaces, due to the complexity of the problem, and this is the main reason why one of the competing systems, uScalpel, is not as efficient as our splicing method. To alleviate the scalability problem, people have proved that using "big data" can be quite effective [6, 7, 59, 10, 57]. Even though our splicing method does not use any statistical method, we still reduce our search space significantly and achieve high efficiency by relying on using natural language to search a big code corpus [33]. + +One novel aspect of this work is that we combine the ideas from search-based methods and data-driven methods. 
To use our programming system for program reuse, a programmer starts by writing a "draft" that mixes unfinished code, natural language comments, and correctness requirements. A program synthesizer that interacts with a large, searchable database of program snippets is used to automatically complete the draft into a program that meets the requirements. The synthesis process happens in two stages. First, the synthesizer identifies a small number of programs in the database [19] that are relevant to the synthesis task. Next it uses an enumerative search to systematically fill the draft with expressions and statements from these relevant programs. The resulting program is returned to the programmer, who can modify it and possibly invoke additional rounds of synthesis. + +We present an implementation of program splicing, called SPLICER, for the Java programming language. SPLICER uses a corpus of over 3.5 million procedures from an open-source software repository. Our evaluation uses the system in a suite of everyday programming tasks, and includes a comparison with a state-of-the-art competing approach [20] as well as a user study. The results point to the broad scope and scalability of program splicing and indicate that the approach can significantly boost programmer productivity. + +### 1.2 API refactoring using natural language and API synthesizer + +Software refactoring typically involves reconstructing existing source code without modifying the functionality, and it is important and almost a daily routine that programmers perform to keep their software projects clean and organized by constructing better abstractions, deleting duplicate code, breaking down big functionalities into small pieces that are universally applicable, etc. 
Software system maintenance is extremely crucial, because a software system can easily deteriorate and become obsolete and useless if maintenance is not done properly and regularly, especially when the external libraries it uses and the other underlying software systems it depends on evolve rapidly nowadays. After several decades of software development, most professional programmers have realized the importance of software refactoring, and software refactoring has been used heavily and regularly in the software industry. Similar to software reuse, software refactoring also inherits the drawbacks from programming. It again requires extreme accuracy from programmers, and programmers tend to make mistakes when they deal with large and complex software systems which typically involve keeping track of tens or even hundreds of variables and function components. + +In this thesis, we focus on refactoring Application Programming Interface (API) call sequences. An API consists of all the definitions and usages of the resources available for external use from a software system, and almost all software systems are built using various APIs from other software systems nowadays. The process of API refactoring mainly consists of changing the API call sequence defined in one library into another sequence defined in another library. The benefit of performing API refactoring is identical to general software refactoring, but API refactoring has its specific benefits. The first specific benefit allows programmers to reuse obsolete programs in which programmers can adapt an obsolete program into the existing programming environment. Another benefit is that it can enhance the performance of existing programs by refactoring the existing program into another program that uses advanced libraries and platforms which typically have better performance. 
+ +The main difficulty of API refactoring comes from discovering semantically equivalent API calls between two libraries and how to instantiate the new API calls using the environment's variables so that the resulting API call sequence does not alter the functionality of the original API call sequence. One of the earliest works [21] that aims to help API refactoring requires human intervention. The user of the system needs to formally specify the mapping between the API calls in two libraries, and the system only focuses on refactoring *individual* API calls instead of refactoring sequences. Subsequent research in the area of API refactoring has been limited to the problem of API mapping or API translation. The goal is to discover two API calls that are semantically equivalent. Two types of methods were developed to solve the problem of API translation. The first one involves aligning two API call sequences using a statistical model and the translations can be extracted from the alignment results [23]. This alignment method allows people to find not only one-to-one API translations but also one-to-many API translations, but the downside is that this method requires a large number of API call sequences to train the underlying statistical method. Another method relies on natural language features such as Javadoc to find semantically equivalent API calls [24, 25, 22]. Since Javadoc contains descriptions of the nature of API calls, correct translations can be found by calculating the similarities between the Javadoc texts of two API calls, and calculating similarities can easily be done using a standard `Word2Vec` model which is able to calculate semantic similarities between words. The only drawback of using natural language features as the main glue is that it is difficult to discover one-to-many API translations. 
+ +In this thesis, we propose a new algorithm that automates the process of API refactoring by combining the natural language technique [24] and a state-of-the-art API call sequence synthesizer called `Bayou` [8]. The input to our algorithm includes an API call sequence and the name of the destination library, and our algorithm can produce another semantically equivalent sequence that uses only the API calls defined in the destination library. We solve the problem in two steps. We first translate the input API call sequences into a set of stand-alone API calls defined in the destination library using natural language features as the main driver [24, 25]. Then we feed the stand-alone API calls into an API sequence synthesizer called *Bayou* [8] which in turn synthesizes a complete sequence of API calls. We have designed a series of benchmark problems to evaluate the accuracy of our API refactoring algorithm, and here the accuracy is defined as the percentage of correctly generated API calls. The results show that our algorithm is able to refactor API call sequences accurately, given that the two involved libraries have similar coding practices and the input sequence is not rare in the training data. + +--- + +## Bibliography + +[1] S. Srivastava, S. Gulwani, and J. S. Foster, "Template-based program verification and program synthesis," *International Journal on Software Tools for Technology Transfer*, vol. 15, no. 5, pp. 497-518, 2012. + +[2] S. Gulwani, "Automating string processing in spreadsheets using input-output examples," in *Proceedings of the 38th Annual ACM SIGPLAN-SIGACT Symposium on Principles of Programming Languages*, POPL '11, (New York, NY, USA), pp. 317-330, ACM, 2011. + +[3] S. Gulwani, V. A. Korthikanti, and A. Tiwari, "Synthesizing geometry constructions," in *Proceedings of the 32Nd ACM SIGPLAN Conference on Programming Language Design and Implementation*, PLDI '11, (New York, NY, USA), pp. 50-61, ACM, 2011. + +[4] S. 
Gulwani, "Dimensions in program synthesis," in *Proceedings of the 12th International ACM SIGPLAN Symposium on Principles and Practice of Declarative Programming*, PPDP '10, (New York, NY, USA), pp. 13-24, ACM, 2010. + +[5] A. Mishne, S. Shoham, and E. Yahav, "Typestate-based Semantic Code Search over Partial Programs," in *Proceedings of the ACM International Conference on Object Oriented Programming Systems Languages and Applications*, OOPSLA '12, (New York, NY, USA), pp. 997-1016, ACM, 2012. + +[6] V. Raychev, M. Vechev, and A. Krause, "Predicting program properties from 'big code'," in *Proceedings of the 42Nd Annual ACM SIGPLAN-SIGACT Symposium on Principles of Programming Languages*, POPL '15, (New York, NY, USA), pp. 111-124, ACM, 2015. + +[7] V. Raychev, M. Vechev, and E. Yahav, "Code completion with statistical language models," in *Proceedings of the 35th ACM SIGPLAN Conference on Programming Language Design and Implementation*, PLDI '14, (New York, NY, USA), pp. 419-428, ACM, 2014. + +[8] V. Murali, L. Qi, S. Chaudhuri, and C. Jermaine, "Neural sketch learning for conditional program generation," *arXiv preprint arXiv:1703.05698*, 2017. + +[9] V. Murali, S. Chaudhuri, and C. Jermaine, "Bayesian specification learning for finding api usage errors," in *Proceedings of the 2017 11th Joint Meeting on Foundations of Software Engineering*, pp. 151-162, ACM, 2017. + +[10] M. Balog, A. L. Gaunt, M. Brockschmidt, S. Nowozin, and D. Tarlow, "Deepcoder: Learning to write programs," *arXiv preprint arXiv:1611.01989*, 2016. + +[11] Z. Manna and R. Waldinger, "Fundamentals of deductive program synthesis," *IEEE Trans. Softw. Eng.*, vol. 18, pp. 674-704, Aug. 1992. + +[12] R. Alur, R. Bodik, G. Juniwal, M. M. K. Martin, M. Raghothaman, S. Seshia, R. Singh, A. Solar-Lezama, E. Torlak, and A. Udupa, "Syntax-guided synthesis," in *Formal Methods in Computer-Aided Design (FMCAD), 2013*, pp. 1-8, IEEE, Oct. 2013. + +[13] A. Solar-Lezama, L. Tancau, R. Bodik, S. 
Seshia, and V. Saraswat, "Combinatorial sketching for finite programs," in *Proceedings of the 12th International Conference on Architectural Support for Programming Languages and Operating Systems*, ASPLOS XII, (New York, NY, USA), pp. 404-415, ACM, 2006. + +[14] F. Long and M. Rinard, "Staged program repair with condition synthesis," in *Proceedings of the 2015 10th Joint Meeting on Foundations of Software Engineering*, ESEC/FSE 2015, (New York, NY, USA), pp. 166-178, ACM, 2015. + +[15] S. Sidiroglou-Douskos, E. Lahtinen, F. Long, and M. Rinard, "Automatic error elimination by horizontal code transfer across multiple applications," in *Proceedings of the 36th ACM SIGPLAN Conference on Programming Language Design and Implementation*, PLDI 2015, (New York, NY, USA), pp. 43-54, ACM, 2015. + +[16] A. Pnueli and R. Rosner, "On the synthesis of an asynchronous reactive module," in *Proceedings of the 16th International Colloquium on Automata, Languages and Programming*, ICALP '89, (London, UK, UK), pp. 652-671, Springer-Verlag, 1989. + +[17] R. Alur, R. Bodik, G. Juniwal, M. M. Martin, M. Raghothaman, S. A. Seshia, R. Singh, A. Solar-Lezama, E. Torlak, and A. Udupa, "Syntax-guided synthesis," *Dependable Software Systems Engineering*, vol. 40, pp. 1-25, 2015. + +[18] J. K. Feser, S. Chaudhuri, and I. Dillig, "Synthesizing data structure transformations from input-output examples," in *Proceedings of the 36th ACM SIGPLAN Conference on Programming Language Design and Implementation*, PLDI 2015, (New York, NY, USA), pp. 229-239, ACM, 2015. + +[19] J. Zou, R. M. Barnett, T. Lorido-Botran, S. Luo, C. Monroy, S. Sikdar, K. Teymourian, B. Yuan, and C. Jermaine, "Plinycompute: A platform for high-performance, distributed, data-intensive tool development," in *Proceedings of the 2018 International Conference on Management of Data*, pp. 1189-1204, ACM, 2018. + +[20] E. T. Barr, M. Harman, Y. Jia, A. Marginean, and J. 
Petke, "Automated software transplantation," in *Proceedings of the 2015 International Symposium on Software Testing and Analysis*, ISSTA 2015, (New York, NY, USA), pp. 257-269, ACM, 2015. + +[21] I. Balaban, F. Tip, and R. Fuhrer, "Refactoring support for class library migration," in *ACM SIGPLAN Notices*, vol. 40, pp. 265-279, ACM, 2005. + +[22] H. Zhong, L. Zhang, T. Xie, and H. Mei, "Inferring resource specifications from natural language api documentation," in *Proceedings of the 2009 IEEE/ACM International Conference on Automated Software Engineering*, pp. 307-318, IEEE Computer Society, 2009. + +[23] A. Gokhale, V. Ganapathy, and Y. Padmanaban, "Inferring likely mappings between apis," in *Proceedings of the 2013 International Conference on Software Engineering*, pp. 82-91, IEEE Press, 2013. + +[24] R. Pandita, R. P. Jetley, S. D. Sudarsan, and L. Williams, "Discovering likely mappings between apis using text mining," in *Source Code Analysis and Manipulation (SCAM), 2015 IEEE 15th International Working Conference on*, pp. 231-240, IEEE, 2015. + +[25] T. D. Nguyen, A. T. Nguyen, and T. N. Nguyen, "Mapping api elements for code migration with vector representations," in *Software Engineering Companion (ICSE-C), IEEE/ACM International Conference on*, pp. 756-758, IEEE, 2016. + +[26] M. Raghothaman, Y. Wei, and Y. Hamadi, "Swim: Synthesizing what i mean," *arXiv preprint arXiv:1511.08497*, 2015. + +[27] M. Kim, L. Bergman, T. Lau, and D. Notkin, "An ethnographic study of copy and paste programming practices in oopl," in *Empirical Software Engineering, 2004. ISESE'04. Proceedings. 2004 International Symposium on*, pp. 83-92, IEEE, 2004. + +[28] E. Juergens, F. Deissenboeck, B. Hummel, and S. Wagner, "Do code clones matter?," in *Proceedings of the 31st International Conference on Software Engineering*, pp. 485-495, IEEE Computer Society, 2009. + +[29] H. Sajnani, V. Saini, J. Ossher, and C. Lopes, "Is popularity a measure of quality? 
an analysis of maven components," in *Software Maintenance and Evolution (ICSME), 2014 IEEE International Conference on*, pp. 231-240, Sept 2014. + +[30] J. Ossher, H. Sajnani, and C. Lopes, "Astra: Bottom-up construction of structured artifact repositories," in *Reverse Engineering (WCRE), 2012 19th Working Conference on*, pp. 41-50, 2012. + +[31] S. Bajracharya, J. Ossher, and C. Lopes, "Sourcerer: An infrastructure for large-scale collection and analysis of open-source code," *Science of Computer Programming*, vol. 79, pp. 241-259, 2014. + +[32] A. Solar-Lezama, "The sketching approach to program synthesis," in *Asian Symposium on Programming Languages and Systems*, pp. 4-13, Springer, 2009. + +[33] V. Kashyap, D. B. Brown, B. Liblit, D. Melski, and T. W. Reps, "Source forager: A search engine for similar source code," *CoRR*, vol. abs/1706.02769, 2017. + +[34] H. Feild, D. Binkley, and D. Lawrie, "An empirical comparison of techniques for extracting concept abbreviations from identifiers," in *Proceedings of IASTED International Conference on Software Engineering and Applications (SEA06)*, Citeseer, 2006. + +[35] N. Yaghmazadeh, C. Klinger, I. Dillig, and S. Chaudhuri, "Synthesizing transformations on hierarchically structured data," in *Proceedings of the 37th ACM SIGPLAN Conference on Programming Language Design and Implementation*, PLDI '16, (New York, NY, USA), pp. 508-521, ACM, 2016. + +[36] Y. Feng, R. Martins, Y. Wang, I. Dillig, and T. W. Reps, "Component-based synthesis for complex apis," in *Proceedings of the 44th ACM SIGPLAN Symposium on Principles of Programming Languages*, POPL 2017, (New York, NY, USA), pp. 599-612, ACM, 2017. + +[37] Y. Feng, R. Martins, J. V. Geffen, I. Dillig, and S. Chaudhuri, "Component-based synthesis of table consolidation and transformation tasks from examples," *CoRR*, vol. abs/1611.07502, 2016. + +[38] O. Polozov and S. Gulwani, "Flashmeta: A framework for inductive program synthesis," *ACM SIGPLAN Notices*, vol. 
50, no. 10, pp. 107-126, 2015. + +[39] A. Udupa, A. Raghavan, J. V. Deshmukh, S. Mador-Haim, M. M. Martin, and R. Alur, "TRANSIT: specifying protocols with concolic snippets," *ACM SIGPLAN Notices*, vol. 48, no. 6, pp. 287-296, 2013. + +[40] R. Alur, R. Bodik, G. Juniwal, M. M. Martin, M. Raghothaman, S. A. Seshia, R. Singh, A. Solar-Lezama, E. Torlak, and A. Udupa, "Syntax-guided synthesis," *Dependable Software Systems Engineering*, vol. 40, pp. 1-25, 2015. + +[41] S. P. Reiss, "Semantics-based code search," in *Proceedings of the 31st International Conference on Software Engineering*, ICSE '09, (Washington, DC, USA), pp. 243-253, IEEE Computer Society, 2009. + +[42] O. Hummel, W. Janjic, and C. Atkinson, "Code conjurer: Pulling reusable software out of thin air," *IEEE software*, vol. 25, no. 5, 2008. + +[43] O. A. Lazzarini Lemos, S. Bajracharya, J. Ossher, P. C. Masiero, and C. Lopes, "Applying test-driven code search to the reuse of auxiliary functionality," in *Proceedings of the 2009 ACM symposium on Applied Computing*, pp. 476-482, ACM, 2009. + +[44] Y. Wang, Y. Feng, R. Martins, A. Kaushik, I. Dillig, and S. P. Reiss, "Hunter: Next-generation code reuse for java," in *Proceedings of the 2016 24th ACM SIGSOFT International Symposium on Foundations of Software Engineering*, FSE 2016, (New York, NY, USA), pp. 1028-1032, ACM, 2016. + +[45] T. Gvero and V. Kuncak, "Interactive synthesis using free-form queries," in *2015 IEEE/ACM 37th IEEE International Conference on Software Engineering*, vol. 2, pp. 689-692, IEEE, 2015. + +[46] "Beanshell," 2017. Accessed: 2017-04-04. + +[47] "Nailgun," 2017. Accessed: 2017-04-04. + +[48] B. Efron, *The jackknife, the bootstrap and other resampling plans*. SIAM, 1982. + +[49] "Jsoup," 2017. Accessed: 2017-04-02. + +[50] "Stackoverflow," 2017. Accessed: 2017-04-02. + +[51] R. Holmes and R. J. Walker, "Systematizing pragmatic software reuse," *ACM Trans. Softw. Eng. Methodol.*, vol. 21, pp. 20:1-20:44, Feb. 2013. + +[52] J. 
Petke, M. Harman, W. B. Langdon, and W. Weimer, "Using genetic improvement and code transplants to specialise a c++ program to a problem class," in *European Conference on Genetic Programming*, pp. 137-149, Springer, 2014. + +[53] M. Harman, Y. Jia, and W. B. Langdon, "Babel pidgin: Sbse can grow and graft entirely new functionality into a real world system," in *International Symposium on Search Based Software Engineering*, pp. 247-252, Springer, 2014. + +[54] Y. Jia, M. Harman, W. B. Langdon, and A. Marginean, "Grow and serve: Growing django citation services using sbse," in *International Symposium on Search Based Software Engineering*, pp. 269-275, Springer, 2015. + +[55] A. Marginean, E. T. Barr, M. Harman, and Y. Jia, "Automated transplantation of call graph and layout features into kate," in *International Symposium on Search Based Software Engineering*, pp. 262-268, Springer, 2015. + +[56] S. Sidiroglou-Douskos, E. Lahtinen, F. Long, and M. Rinard, "Automatic error elimination by horizontal code transfer across multiple applications," in *ACM SIGPLAN Notices*, vol. 50, pp. 43-54, ACM, 2015. + +[57] A. Hindle, E. T. Barr, Z. Su, M. Gabel, and P. Devanbu, "On the naturalness of software," in *2012 34th International Conference on Software Engineering (ICSE)*, pp. 837-847, IEEE, 2012. + +[58] A. T. Nguyen and T. N. Nguyen, "Graph-based statistical language model for code," in *Proceedings of the 37th International Conference on Software Engineering-Volume 1*, pp. 858-868, IEEE Press, 2015. + +[59] V. Raychev, P. Bielik, M. Vechev, and A. Krause, "Learning programs from noisy data," in *ACM SIGPLAN Notices*, vol. 51, pp. 761-774, ACM, 2016. + +[60] K. Narasimhan and C. Reichenbach, "Copy and paste redeemed," in *Automated Software Engineering (ASE), 2015 30th IEEE/ACM International Conference on*, pp. 630-640, IEEE, 2015. + +[61] C. K. Roy, J. R. Cordy, and R. 
Koschke, "Comparison and evaluation of code clone detection techniques and tools: A qualitative approach," *Science of Computer Programming*, vol. 74, no. 7, pp. 470-495, 2009. + +[62] L. Jiang, G. Misherghi, Z. Su, and S. Glondu, "Deckard: Scalable and accurate tree-based detection of code clones," in *Proceedings of the 29th international conference on Software Engineering*, pp. 96-105, IEEE Computer Society, 2007. + +[63] I. Keivanloo, J. Rilling, and Y. Zou, "Spotting working code examples," in *Proceedings of the 36th International Conference on Software Engineering*, pp. 664-675, ACM, 2014. + +[64] Y. David and E. Yahav, "Tracelet-based code search in executables," in *ACM SIGPLAN Notices*, vol. 49, pp. 349-360, ACM, 2014. + +[65] M. Grechanik, C. Fu, Q. Xie, C. McMillan, D. Poshyvanyk, and C. Cumby, "A search engine for finding highly relevant applications," in *ACM/IEEE International Conference on Software Engineering*, (New York, New York, USA), ACM Press, May 2010. + +[66] C. McMillan, M. Grechanik, D. Poshyvanyk, Q. Xie, and C. Fu, "Portfolio: finding relevant functions and their usage," in *International conference on Software engineering*, (New York, New York, USA), ACM Press, May 2011. + +[67] K. T. Stolee and S. Elbaum, "Toward semantic search via smt solver," in *Proceedings of the ACM SIGSOFT 20th International Symposium on the Foundations of Software Engineering*, p. 25, ACM, 2012. + +[68] Y. Ke, K. T. Stolee, C. Le Goues, and Y. Brun, "Repairing programs with semantic code search (t)," in *Automated Software Engineering (ASE), 2015 30th IEEE/ACM International Conference on*, pp. 295-306, IEEE, 2015. + +[69] W. Wu, Y.-G. Gueheneuc, G. Antoniol, and M. Kim, "Aura: a hybrid approach to identify framework evolution," in *Software Engineering, 2010 ACM/IEEE 32nd International Conference on*, vol. 1, pp. 325-334, IEEE, 2010. + +[70] A. T. Nguyen, H. A. Nguyen, T. T. Nguyen, and T. N. 
Nguyen, "Statistical learning approach for mining api usage mappings for code migration," in *Proceedings of the 29th ACM/IEEE international conference on Automated software engineering*, pp. 457-468, ACM, 2014. + +[71] H. Zhong, S. Thummalapenta, T. Xie, L. Zhang, and Q. Wang, "Mining api mapping for language migration," in *Proceedings of the 32nd ACM/IEEE International Conference on Software Engineering-Volume 1*, pp. 195-204, ACM, 2010. + +[72] A. Gokhale, D. Kim, and V. Ganapathy, "Data-driven inference of api mappings," in *Proceedings of the 2nd Workshop on Programming for Mobile & Touch*, pp. 29-32, ACM, 2014. + +[73] H. A. Nguyen, T. T. Nguyen, G. Wilson Jr, A. T. Nguyen, M. Kim, and T. N. Nguyen, "A graph-based approach to api usage adaptation," in *ACM Sigplan Notices*, vol. 45, pp. 302-321, ACM, 2010. + +[74] C. Teyton, J.-R. Falleri, and X. Blanc, "Mining library migration graphs," in *19th Working Conference on Reverse Engineering, WCRE 2012, Kingston, ON, Canada, October 15-18, 2012*, pp. 289-298, 2012. + +[75] H. D. Phan, A. T. Nguyen, T. D. Nguyen, and T. N. Nguyen, "Statistical migration of api usages," in *Software Engineering Companion (ICSE-C), 2017 IEEE/ACM 39th International Conference on*, pp. 47-50, IEEE, 2017. + +[76] T. T. Nguyen, H. A. Nguyen, N. H. Pham, J. M. Al-Kofahi, and T. N. Nguyen, "Graph-based mining of multiple object usage patterns," in *Proceedings of the the 7th joint meeting of the European software engineering conference and the ACM SIGSOFT symposium on The foundations of software engineering*, pp. 383-392, ACM, 2009. 
diff --git a/documents/academic/rice_engi601/lu_writing_tex.md b/documents/academic/rice_engi601/lu_writing_tex.md new file mode 100644 index 0000000..6a94ffe --- /dev/null +++ b/documents/academic/rice_engi601/lu_writing_tex.md @@ -0,0 +1,301 @@ +--- +type: paper +category: academic +person: Yanxin Lu +date: 2018 +source: writing.tex +--- + +# Thesis Introduction - LaTeX Source (writing.tex) + +This is the LaTeX source file for the thesis introduction chapter. The compiled PDF version is available as `lu_writing.pdf`. + +```latex +\chapter{Introduction} +\label{ch:intro} +With the advancement in technologies such as artificial intelligence +and also the expansions of high-tech companies, computer programming +starts to become an important skill, and the demand for programmers +has been growing dramatically in the past few years. The overall +productivity has been boosted significantly thanks to the increasing +number of programmers, but we still have not witnessed any boost in +individual programming productivity. + +The most important reason is that programming is a difficult task. It +requires programmers to deal with extremely low-level details in +complex software projects, and it is almost inevitable for programmers +to make small mistakes. People tend to assume that a piece of untested +software does not function properly. To deal with this problem, +software engineering techniques and formal method based techniques +have been proposed to help facilitate programming. These techniques +include various software engineering methodologies, design patterns, +sophisticated testing methods, program repair algorithms, model +checking algorithms and program synthesis methods. Some techniques +such as software engineering methodologies, design patterns and unit +testing have been practical and useful in boosting programming +productivity and the industry has been adopting these techniques for +more than a decade.
The main reason for their popularity and longevity +is that these techniques are quite easy to execute for average +programmers. However, one dominant problem with these software +engineering approaches is that they are not rigorous enough. If the +specification of a method is not followed strictly, its benefits will +tend to be hindered. Advanced methods with more rules have been +proposed, but the specifications tend to be vague sometimes, which +results in execution difficulties. + +Some researchers switched their attention to applying formal methods +to tackle the difficulties in programming. Methods such as model +checking and program synthesis are much more rigorous than traditional +software engineering techniques, and their performance and benefits are +guaranteed once everything works accordingly. However, the impact of +these formal methods techniques is much less compared to the influence +brought by the software engineering techniques, and the reason is that +it is very likely that a formal method based approach will not work +when large input is provided, because it will not terminate and +produce any useful result due to its large search space. These large +search spaces are inevitable, since formal methods techniques +typically deal with extremely complex problems in theory. However, +people have been trying to make formal method approaches practical by +introducing additional hints~\cite{Srivastava2012} or by restricting +the problem domain~\cite{Gulwani2011spreadsheet, Gulwani2011, + Gulwani2010}. + +With the advent of ``big data'', researchers started to pay attention +to the problems that were considered difficult or impossible, and this +has led to a significant advancement in the area of machine +learning.
Similarly, as more and more open source repositories such as +\verb|Google Code|, \verb|Github| and \verb|SourceForge| have come +online where thousands of software projects and their source code +become available, researchers from the programming language community +also started to consider using ``big code'' to tackle the problems +that were considered difficult. With the help of ``big code'', many +new techniques that use formal methods and aim to facilitate +programming have been proposed. These techniques include program +property prediction~\cite{mishne12, Raychev2015}, API sequence +prediction~\cite{Raychev2014, murali2017neural, murali2017bayesian} +and small program generations~\cite{balog2016deepcoder}. Researchers +have shown that using data can indeed make the problem of synthesis +feasible~\cite{balog2016deepcoder} and practical tools that can help +human developers have also started to appear and programmers have +started to use those in practice~\cite{Raychev2015, murali2017neural}. + +Two major types of algorithms were used in the current literature of +applying formal methods to software engineering. The first type of +algorithms is based on combinatorial search. Combinatorial search +plays an important role in model checking and traditional program +synthesis problems~\cite{Manna1992, rajeev2013, lezama06, Long2015, + Douskos2015, Pnueli1989, Alur2015, Feser2015, Gulwani2010}. The main +idea is to first define a goal and also the steps for reaching the +goal. Programmers can then let the computer search for a +solution. Typically heuristics are defined to reduce the search space +and to speed up the search time.
The advantages of search-based +methods include (1) it is relatively easy to implement and it can be +used to solve problems where no efficient solutions exist, (2) +sometimes the algorithms can discover results that are hard to think +about as humans because computers can easily discover solutions in a +large search space quickly compared to humans, and (3) search-based +methods can solve problems that require precision and precision is +typically required for analyzing computer programs. As SAT solvers and +SMT solvers became sophisticated, people have been able to use those +fast solvers to gain a significant performance boost. The biggest +drawback of search-based methods is their high algorithmic +complexity. The search space grows indefinitely as the input size +gradually increases and this is the main reason why most traditional +model checking methods and program synthesis algorithms cannot deal +with large programs~\cite{Gulwani2010}. Another drawback that is worth +mentioning is that search-based methods tend to be quite +fragile. Those methods typically require inputs at every step to be +extremely precise, or the algorithm would not perform as expected. + +The second type of algorithms is based on learning. The idea of +learning is to let the machine improve its performance using data in +solving a task and during the process learning-based methods are able +to capture idioms that are essential in solving the problem. These +idioms are typically hard to express or discover for humans. The large +amount of data was not available online until around 2012 and after +that researchers started applying learning-based methods to +programming systems~\cite{mishne12, Raychev2015, Raychev2014, + murali2017neural, murali2017bayesian, balog2016deepcoder}. The +biggest advantage brought by ``big data'' or ``big code'' is that it +allows researchers to find idioms that reduce the search space +significantly by using machine learning techniques.
Examples include +relationships between variable names and their semantic information +and API call sequence idioms. These idioms cannot be made available +without people analyzing a large amount of data. Another advantage +compared to search-based methods is its robustness and this is because +machine learning algorithms tend to use a large amount of data where +small noises are suppressed. Even though data-driven programming +systems are quite impactful, learning-based methods are not as +accessible as search-based methods because learning-based methods tend +to require data. In order to make learning-based algorithms perform +well in practice, a large amount of data is typically required and +this also leads to a large consumption of time and computation +resources which might not be available for everyone. + +In this thesis, we propose two additional corpus-driven systems that +aim to automate the process of software reuse and software +refactoring. In the current literature, the problems of software reuse +and refactoring have both been considered, but no systems can fully +automate software reuse and refactoring and some state-of-the-art +tools~\cite{Barr2015, balaban2005refactoring} still require humans to +provide additional hints. By using a large code corpus, we claim that +our systems can fully automate the process of software reuse and +refactoring without human intervention, and our systems can accomplish +the tasks efficiently and help human developers by boosting their +program productivity. + +\section{Program reuse via splicing} +We first introduce {\em program splicing}, a programming system that +helps human developers by automating the process of software +reuse. The most popular workflow nowadays consists of copying, pasting, +and modifying code available online and the reason for its domination +is that it is relatively easy to execute with the help of internet +search. However, this process inherits the drawbacks from +programming.
This process requires extreme precision and carefulness +from programmers similar to normal programming. When a software reuse +task happens in a large and complicated software system, the cost of +making mistakes and spending enormous time on repairing might exceed +the benefit, let alone the fact that programmers sometimes do not even +try to fully understand the code they bring in from the internet so +long as it appears to work under their specific software +environment. This might impose a threat to their future software +development progress. + +Existing techniques that inspire the idea of our method can be divided +into two areas, search-based program synthesis techniques and +data-driven methods. The problem of program synthesis has been studied +for decades and researchers have been applying search-based methods to +tackle the problem for several decades~\cite{Pnueli1989, lezama06, + Srivastava2012, Alur2015, Feser2015, yaghmazadeh2016}. The main +benefit with respect to this work comes from the fact that +search-based methods can produce results that require precision. This +is quite crucial when we aim to generate code snippets that need to +interact with pre-written software pieces and examples might include +matching variables that are semantically similar or +equivalent. However, the problem with search-based methods is that they +do not scale well into handling large inputs, which lead to large +search spaces, due to the complexity of the problem, and this is the +main reason why one of the competing systems, $\mu$Scalpel, is not as +efficient as our splicing method. To alleviate the scalability +problem, people have proved that using ``big data'' can be quite +effective~\cite{Raychev2015, Raychev2014, raychev2016, + balog2016deepcoder, hindle2012naturalness}.
Even though our splicing +method does not use any statistical method, we still reduce our search +space significantly and achieve high efficiency by relying on using natural +language to search a big code corpus~\cite{kashyap17}. + +One of the novelties of this work is that we combine the ideas from +search-based methods and data-driven methods. To use our programming +system for program reuse, a programmer starts by writing a ``draft'' +that mixes unfinished code, natural language comments, and correctness +requirements. A program synthesizer that interacts with a large, +searchable database of program snippets is used to automatically +complete the draft into a program that meets the requirements. The +synthesis process happens in two stages. First, the synthesizer +identifies a small number of programs in the +database~\cite{zou2018plinycompute} that are relevant to the synthesis +task. Next it uses an enumerative search to systematically fill the +draft with expressions and statements from these relevant +programs. The resulting program is returned to the programmer, who can +modify it and possibly invoke additional rounds of synthesis. + +We present an implementation of program splicing, called \system, for +the Java programming language. \system uses a corpus of over 3.5 +million procedures from an open-source software repository. Our +evaluation uses the system in a suite of everyday programming tasks, +and includes a comparison with a state-of-the-art competing +approach~\cite{Barr2015} as well as a user study. The results point to +the broad scope and scalability of program splicing and indicate that +the approach can significantly boost programmer productivity.
+ +\section{API refactoring using natural language and API synthesizer} +Software refactoring typically involves reconstructing existing source +code without modifying the functionality, and it is important and +almost a daily routine that programmers will perform to keep their +software projects clean and organized by constructing better +abstractions, deleting duplicate code, breaking down big +functionalities into small pieces that are universally applicable, +etc. Software system maintenance is extremely crucial, because a +software system can easily deteriorate and become obsolete and useless +if maintenance is not done properly and regularly, especially when the +external libraries it uses and the other underlying software systems +it depends on evolve rapidly nowadays. After several decades of +software development, most professional programmers have realized the +importance of software refactoring, and software refactoring has been +used heavily and regularly in the software industry. Similar to +software reuse, software refactoring also inherits the drawbacks from +programming. It again requires extreme accuracy from programmers, and +programmers tend to make mistakes when they deal with large and +complex software systems which typically involves keeping track of +tens or even hundreds of variables and function components. + +In this thesis, we focus on refactoring Application Programming +Interface (API) call sequences. An API consists of all the definitions +and usages of the resources available for external use from a software +system, and almost all software systems are built using various APIs +from other software systems nowadays. The process of API refactoring +mainly consists of changing the API call sequence defined in one +library into another sequence defined in another library. The benefit +of performing API refactoring is identical to general software +refactoring, but API refactoring has its specific benefits.
The first +specific benefit allows programmers to reuse obsolete programs in which +programmers can adopt an obsolete program into the existing +programming environment. Another benefit is that it can enhance the +performance of existing programs by refactoring the existing program +into another program that uses advanced libraries and platforms which +typically have better performance. + +The main difficulty of API refactoring comes from discovering +semantically equivalent API calls between two libraries and how to +instantiate the new API calls using the environment's variables so +that the resulting API call sequence does not alter the functionality +of the original API call sequence. One of the earliest +works~\cite{balaban2005refactoring} that aims to help API refactoring +requires human intervention. The user of the system needs to formally +specify the mapping between the API calls in two libraries, and the +system only focuses on refactoring \emph{individual} API calls instead +of refactoring sequences. Subsequent research in the area of API +refactoring has been limited to the problem of API mapping or API +translation. The goal is to discover two API calls that are +semantically equivalent. Two types of methods were developed to solve +the problem of API translation. The first one involves aligning two +API call sequences using a statistical model and the translations can +be extracted from the alignment +results~\cite{gokhale2013inferring}. This alignment method allows +people to find not only one-to-one API translations but also +one-to-many API translations, but the downside is that this method +requires a large number of API call sequences to train the underlying +statistical method. Another method relies on natural language features +such as Javadoc to find semantically equivalent API +calls~\cite{pandita2015discovering, nguyen2016mapping, + zhong2009inferring}.
Since Javadoc contains descriptions on the +nature of API calls, correct translations can be found by calculating +the similarities between the Javadoc texts of two API calls, and +calculating similarities can easily be done using a standard +\verb|Word2Vec| model which is able to calculate semantic similarities +between words. The only drawback of using natural language features as +the main glue is that it is difficult to discover one-to-many API +translations. + +In this thesis, we propose a new algorithm that automates the process +of API refactoring by combining the natural language +technique~\cite{pandita2015discovering} and a state-of-the-art API +call sequence synthesizer called +\verb|Bayou|~\cite{murali2017neural}. The input to our algorithm +includes an API call sequence and the name of the destination library, +and our algorithm can produce another semantically equivalent sequence +that uses only the API calls defined in the destination library. We +solve the problem in two steps. We first translate the input API call +sequences into a set of stand-alone API calls defined in the +destination library using natural language features as the main +driver~\cite{pandita2015discovering, nguyen2016mapping}. Then we feed +the stand-alone API calls into an API sequence synthesizer called +\emph{Bayou}~\cite{murali2017neural} which in turn synthesizes a +complete sequence of API calls. We have designed a series of benchmark +problems to evaluate the accuracy of our API refactoring algorithm, +and here the accuracy is defined as the percentage of correctly +generated API calls. The results show that our algorithm is able to +refactor API call sequences accurately, given that the two involved +libraries have similar coding practices and the input sequence is not +rare in the training data.
+``` diff --git a/documents/car/2024 LEXUS NX250 DOCUMENTS.md b/documents/car/lexus_nx250_2024_purchase.md similarity index 99% rename from documents/car/2024 LEXUS NX250 DOCUMENTS.md rename to documents/car/lexus_nx250_2024_purchase.md index 2147e0e..f4350b2 100644 --- a/documents/car/2024 LEXUS NX250 DOCUMENTS.md +++ b/documents/car/lexus_nx250_2024_purchase.md @@ -3,7 +3,7 @@ type: car-purchase-documents category: vehicle person: Yanxin Lu, Xuewei Jiang date: 2023-12-02 -source: "2024 LEXUS NX250 DOCUMENTS.pdf" +source: lexus_nx250_2024_purchase.pdf --- # 2024 Lexus NX 250 Purchase Documents diff --git a/documents/car/2024 LEXUS NX250 DOCUMENTS.pdf b/documents/car/lexus_nx250_2024_purchase.pdf similarity index 100% rename from documents/car/2024 LEXUS NX250 DOCUMENTS.pdf rename to documents/car/lexus_nx250_2024_purchase.pdf diff --git a/documents/dmv/jiang_2020/jiang-i797-h1b-2020.md b/documents/dmv/jiang_2020/jiang_i797_h1b_2020.md similarity index 99% rename from documents/dmv/jiang_2020/jiang-i797-h1b-2020.md rename to documents/dmv/jiang_2020/jiang_i797_h1b_2020.md index 980fe3a..f40fe7b 100644 --- a/documents/dmv/jiang_2020/jiang-i797-h1b-2020.md +++ b/documents/dmv/jiang_2020/jiang_i797_h1b_2020.md @@ -3,7 +3,7 @@ type: immigration document category: H1B approval notice person: Xuewei Jiang date: 2020-10-07 -source: jiang-i797-h1b-2020.pdf +source: jiang_i797_h1b_2020.pdf --- # I-797A Notice of Action - H1B Approval diff --git a/documents/dmv/jiang_2020/jiang-i797-h1b-2020.pdf b/documents/dmv/jiang_2020/jiang_i797_h1b_2020.pdf similarity index 100% rename from documents/dmv/jiang_2020/jiang-i797-h1b-2020.pdf rename to documents/dmv/jiang_2020/jiang_i797_h1b_2020.pdf diff --git a/documents/dmv/jiang_2020/jiang-passport.md b/documents/dmv/jiang_2020/jiang_passport.md similarity index 99% rename from documents/dmv/jiang_2020/jiang-passport.md rename to documents/dmv/jiang_2020/jiang_passport.md index c98ce89..c5af2b4 100644 --- 
a/documents/dmv/jiang_2020/jiang-passport.md +++ b/documents/dmv/jiang_2020/jiang_passport.md @@ -3,7 +3,7 @@ type: passport category: identity document person: Xuewei Jiang date: 2011-04-04 -source: jiang-passport.pdf +source: jiang_passport.pdf --- # Chinese Passport - Jiang Xuewei (Old) diff --git a/documents/dmv/jiang_2020/jiang-passport.pdf b/documents/dmv/jiang_2020/jiang_passport.pdf similarity index 100% rename from documents/dmv/jiang_2020/jiang-passport.pdf rename to documents/dmv/jiang_2020/jiang_passport.pdf diff --git a/documents/employment/usc/jiang_ead_2020.md b/documents/employment/usc/jiang_ead_2020.md new file mode 100644 index 0000000..ddcf023 --- /dev/null +++ b/documents/employment/usc/jiang_ead_2020.md @@ -0,0 +1,55 @@ +--- +type: immigration-document +category: employment +person: Xuewei Jiang +date: 2020 +provider: University of Southern California +source: jiang_ead_2020.pdf +--- + +# Employment Authorization Document (EAD) -- Jiang Xuewei + +## Front Side + +**UNITED STATES OF AMERICA -- EMPLOYMENT AUTHORIZATION** + +| Field | Value | +|-------|-------| +| Surname | JIANG | +| Given Name | XUEWEI | +| USCIS # | 116-564-749 | +| Country of Birth | China, People's Republic | +| Category | C03B | +| Card # | C03B YSC2090171865 | +| Terms and Conditions | Post-Completion Opt | +| Date of Birth | 13 MAR 1993 | +| Sex | F | +| Valid From | 06/01/20 | +| Card Expires | 05/31/21 | +| NOT VALID FOR REENTRY TO U.S. | (printed on card) | + +Note: "fingerprint not available" indicator present on card. + +Photo: Female, front-facing. + +## Back Side + +| Field | Value | +|-------|-------| +| FORM I-766 | Rev (02-2016) | +| Card Number | 36576677 | +| Rev | 121 | + +### Machine Readable Zone (MRZ) + +``` +IAUSA1165647493YSC2090171865<<3 +JIANG< **Note:** This file is a duplicate of the same image archived in `medical/covid/jiang_vaccine_3rd_shot.jpg`. It is retained here because it was used as a supporting document for China travel in 2021-2022. 
+ +| Field | Value | +|-------|-------| +| Last Name | Jiang | +| First Name | Erica (Xuewei) | +| MI | — | +| Date of Birth | 03/13/1993 | +| Patient Number | — | + +## Vaccination Records + +| Dose | Vaccine | Product Name/Manufacturer | Lot Number | Date | Healthcare Professional or Clinic Site | +|------|---------|--------------------------|------------|------|---------------------------------------| +| 1st Dose COVID-19 | Pfizer | Pfizer | ER8729 | 4/6/21 | LAFD USC | +| 2nd Dose COVID-19 | Pfizer | Pfizer | EW 0171 | 4/27/21 | LAFD - USC | +| Other | Pfizer | Pfizer | FJ1620 | 11/24/21 | USC Pharmacy | +| Other | — | — | — | — | — | + +Scanned by TapScanner. diff --git a/documents/travel/china_2021/lu_vaccine_2nd_shot.jpg b/documents/travel/china_2021/lu_vaccine_2nd_shot.jpg new file mode 100644 index 0000000..ee7ebe1 Binary files /dev/null and b/documents/travel/china_2021/lu_vaccine_2nd_shot.jpg differ diff --git a/documents/travel/china_2021/lu_vaccine_2nd_shot.md b/documents/travel/china_2021/lu_vaccine_2nd_shot.md new file mode 100644 index 0000000..d8daabc --- /dev/null +++ b/documents/travel/china_2021/lu_vaccine_2nd_shot.md @@ -0,0 +1,48 @@ +--- +type: vaccine-record +category: travel +person: Yanxin Lu +date: 2021-05-06 +source: lu_vaccine_2nd_shot.jpg +--- + +# COVID-19 Vaccination Record Card — Lu Yanxin + +> **Note:** This file is a duplicate of the same image archived in `medical/covid/lu_vaccine_2nd_shot.jpg`. It is retained here because it was used as a supporting document for China travel in 2021-2022. + +Two cards are shown in this image (original and replacement/updated). 
+ +## Card 1 (Top) + +| Field | Value | +|-------|-------| +| Last Name | Lu | +| First Name | Yanxin | +| MI | — | +| Date of Birth | 10/17/1989 | +| Patient Number | — | + +| Dose | Vaccine | Lot # | EXP | Date | Healthcare Professional or Clinic Site | +|------|---------|-------|-----|------|---------------------------------------| +| 1st Dose COVID-19 | PFIZER COVID-19 VACCINE | EW0161 | 07/31/2021 | 4/15/2021 | FORUM | +| 2nd Dose COVID-19 | — | — | — | — | — | +| Other | — | — | — | — | — | +| Other | — | — | — | — | — | + +## Card 2 (Bottom) + +| Field | Value | +|-------|-------| +| Last Name | Lu | +| First Name | Yanxin | +| MI | — | +| Date of Birth | 10-17-1989 | +| Patient Number | — | + +| Dose | Vaccine | Product Name/Manufacturer | Lot Number | Date | Healthcare Professional or Clinic Site | +|------|---------|--------------------------|------------|------|---------------------------------------| +| 1st Dose COVID-19 | Pfizer | Pfizer | EW0161 | 4/15/21 | Forum | +| 2nd Dose COVID-19 | PFIZER COVID-19 VACCINE | — | EW0173 | 5/6/2021 (EXP: 08/31/2021) | FORUM | +| Other | — | — | — | — | — | + +Scanned by TapScanner. diff --git a/documents/travel/china_2021/lu_vaccine_record_2022.jpg b/documents/travel/china_2021/lu_vaccine_record_2022.jpg new file mode 100644 index 0000000..86a93fc Binary files /dev/null and b/documents/travel/china_2021/lu_vaccine_record_2022.jpg differ diff --git a/documents/travel/china_2021/lu_vaccine_record_2022.md b/documents/travel/china_2021/lu_vaccine_record_2022.md new file mode 100644 index 0000000..3770c9c --- /dev/null +++ b/documents/travel/china_2021/lu_vaccine_record_2022.md @@ -0,0 +1,26 @@ +--- +type: vaccine-record +category: travel +person: Yanxin Lu +date: 2022-02-24 +source: lu_vaccine_record_2022.jpg +--- + +# SMART Health Card — Lu Yanxin (California Digital COVID-19 Vaccine Record) + +Screenshot from CA.GOV State of California digital vaccine record system. 
+ +| Field | Value | +|-------|-------| +| Name | Yanxin Lu | +| DOB | 10/17/1989 | + +## Vaccination Records + +| Dose | Dose Date | Dose Type/Manufacturer | +|------|-----------|----------------------| +| 1 | 04/15/2021 | Pfizer | +| 2 | 05/06/2021 | Pfizer | +| 3 (Booster) | 02/24/2022 | Pfizer | + +The image includes a SMART Health Card QR code for digital verification. diff --git a/documents/travel/china_2021/test_result_jiang.md b/documents/travel/china_2021/test_result_jiang.md new file mode 100644 index 0000000..ef70506 --- /dev/null +++ b/documents/travel/china_2021/test_result_jiang.md @@ -0,0 +1,72 @@ +--- +type: test-result +category: travel +person: Xuewei Jiang +date: 2021-12-17 +source: test_result_jiang.pdf +--- + +# COVID-19 Test Result — Jiang Xuewei + +## Lab Information + +| Field | Value | +|-------|-------| +| Lab | Genentox Labs / Nova Diagnostics Labs | +| Address | 2001 E 1st St Ste 109, Santa Ana, CA 92705 | +| Phone | 833-668-2522 | +| CLIA# | 05D0871568 | +| Lab Director | Rosaura Williams MD | +| Email | results@novadxlabs.com | +| Report Type | Final Report — SARS RT-PCR FLY | + +## Patient Information + +| Field | Value | +|-------|-------| +| Name | Jiang, Xuewei | +| DOB | 03/13/1993 | +| Gender | Female | +| Address | 11950 Idaho Ave. Apt. 
113, Los Angeles, CA 90025 | +| MRN | EJ4954380 | + +## Sample Information + +| Field | Value | +|-------|-------| +| Collected | 12/17/2021 11:02 | +| Received | 12/17/2021 | +| Reported | 12/17/2021 | + +## Clinic Information + +| Field | Value | +|-------|-------| +| Client | Chinese Consulate | +| Site | LAX Walk In | +| Physician | Maxwell Y Jen | + +## Detailed Results Summary + +| Specimen ID | Test | Specimen Type | Results | Expected Value | +|-------------|------|---------------|---------|----------------| +| 391192 | SARS CoV-2 (Covid-19) by RT-PCR (NAAT) | Nasopharyngeal Swab | Not Detected (negative) | Not Detected | +| 391193 | COVID-19 IgM (Chemiluminescence) | Venous Blood Draw | Detected (1.11 s/co) | <1 | +| 391194 | N-Protein | Blood Drop | Not Detected | Not Detected | + +**Resulted By:** Eyas Mousa +**Date:** 12/17/2021 + +**Final Result for SARS CoV-2 (COVID-19): NOT DETECTED (negative)** + +> Note: IgM was Detected (1.11 s/co) which is above the expected value of <1, but the overall final result was still NOT DETECTED (negative) based on the RT-PCR (NAAT) test. + +## Disclaimers + +Not Detected (negative) results do not preclude SARS-CoV-2 infection and should not be used as the sole basis for patient management decisions. Negative results must be combined with clinical observations, patient history, and epidemiological information. Collection of multiple specimens or types of specimens may be necessary to detect virus. Improper specimen collection and handling, sequence variability under primers/probes or viruses present below the limit of detection may lead to false negative results. Positive and negative predictive values of testing are highly dependent on prevalence. False negative test results are more likely when prevalence is high. + +This test has been authorized by the FDA under an Emergency Use Authorization (EUA). 
The test is only authorized for the duration of the declaration that circumstances exist justifying the authorization of emergency use of in vitro diagnostic tests for detection and/or diagnosis of SARS-CoV-2 under Section 564(b)(1) of the Act, 21 U.S.C. section 360bbb-3(b)(1), unless the authorization is terminated or revoked sooner. FDA review of the validation is pending. + +The SARS-CoV-2 test is intended for the qualitative detection of nucleic acids from SARS-CoV-2 in nasal, nasopharyngeal and oropharyngeal swab samples from patients who meet COVID-19 clinical and/or epidemiological criteria. Testing methodology is (Real Time) RT-PCR. The assay targets the S, N and ORF1ab genes. Test results must be correlated with clinical presentation and evaluated in the context of another laboratory and epidemiologic data. Test performance can be affected because the epidemiology and clinical spectrum of infection caused by SARS-CoV-2 is not fully understood. + +Genentox Laboratories, LLC. DBA Nova Diagnostics Labs CLIA Certification Number: 05D0871568 is certified under the Clinical Laboratory Improvement Amendments of 1988 (CLIA), 42 U.S.C. section 263a, to perform high complexity tests. 
diff --git a/documents/travel/china_2021/test_result_jiang.pdf b/documents/travel/china_2021/test_result_jiang.pdf new file mode 100644 index 0000000..4274ab4 Binary files /dev/null and b/documents/travel/china_2021/test_result_jiang.pdf differ diff --git a/documents/travel/china_2021/test_result_lu.md b/documents/travel/china_2021/test_result_lu.md new file mode 100644 index 0000000..e155a92 --- /dev/null +++ b/documents/travel/china_2021/test_result_lu.md @@ -0,0 +1,70 @@ +--- +type: test-result +category: travel +person: Yanxin Lu +date: 2021-12-17 +source: test_result_lu.pdf +--- + +# COVID-19 Test Result — Lu Yanxin + +## Lab Information + +| Field | Value | +|-------|-------| +| Lab | Genentox Labs / Nova Diagnostics Labs | +| Address | 2001 E 1st St Ste 109, Santa Ana, CA 92705 | +| Phone | 833-668-2522 | +| CLIA# | 05D0871568 | +| Lab Director | Rosaura Williams MD | +| Email | results@novadxlabs.com | +| Report Type | Final Report — SARS RT-PCR FLY | + +## Patient Information + +| Field | Value | +|-------|-------| +| Name | Lu, Yanxin | +| DOB | 10/17/1989 | +| Gender | Male | +| Address | 11950 Idaho Ave. Apt. 
113, Los Angeles, CA 90025 | +| MRN | E93603635 | + +## Sample Information + +| Field | Value | +|-------|-------| +| Collected | 12/17/2021 11:53 | +| Received | 12/17/2021 | +| Reported | 12/17/2021 | + +## Clinic Information + +| Field | Value | +|-------|-------| +| Client | Chinese Consulate | +| Site | LAX Walk In | +| Physician | Maxwell Y Jen | + +## Detailed Results Summary + +| Specimen ID | Test | Specimen Type | Results | Expected Value | +|-------------|------|---------------|---------|----------------| +| 391449 | SARS CoV-2 (Covid-19) by RT-PCR (NAAT) | Nasopharyngeal Swab | Not Detected (negative) | Not Detected | +| 391450 | COVID-19 IgM (Chemiluminescence) | Venous Blood Draw | Not Detected (negative) (0 s/co) | <1 | +| 391451 | N-Protein | Blood Drop | Not Detected | Not Detected | + +**Resulted By:** Eyas Mousa +**Date:** 12/17/2021 + +**Final Result for SARS CoV-2 (COVID-19): NOT DETECTED (negative)** + +## Disclaimers + +Not Detected (negative) results do not preclude SARS-CoV-2 infection and should not be used as the sole basis for patient management decisions. Negative results must be combined with clinical observations, patient history, and epidemiological information. Collection of multiple specimens or types of specimens may be necessary to detect virus. Improper specimen collection and handling, sequence variability under primers/probes or viruses present below the limit of detection may lead to false negative results. Positive and negative predictive values of testing are highly dependent on prevalence. False negative test results are more likely when prevalence is high. + +This test has been authorized by the FDA under an Emergency Use Authorization (EUA). The test is only authorized for the duration of the declaration that circumstances exist justifying the authorization of emergency use of in vitro diagnostic tests for detection and/or diagnosis of SARS-CoV-2 under Section 564(b)(1) of the Act, 21 U.S.C. 
section 360bbb-3(b)(1), unless the authorization is terminated or revoked sooner. FDA review of the validation is pending. + +The SARS-CoV-2 test is intended for the qualitative detection of nucleic acids from SARS-CoV-2 in nasal, nasopharyngeal and oropharyngeal swab samples from patients who meet COVID-19 clinical and/or epidemiological criteria. Testing methodology is (Real Time) RT-PCR. The assay targets the S, N and ORF1ab genes. Test results must be correlated with clinical presentation and evaluated in the context of another laboratory and epidemiologic data. Test performance can be affected because the epidemiology and clinical spectrum of infection caused by SARS-CoV-2 is not fully understood. + +Genentox Laboratories, LLC. DBA Nova Diagnostics Labs CLIA Certification Number: 05D0871568 is certified under the Clinical Laboratory Improvement Amendments of 1988 (CLIA), 42 U.S.C. section 263a, to perform high complexity tests. diff --git a/documents/travel/china_2021/test_result_lu.pdf b/documents/travel/china_2021/test_result_lu.pdf new file mode 100644 index 0000000..5d71a52 Binary files /dev/null and b/documents/travel/china_2021/test_result_lu.pdf differ diff --git a/documents/travel/china_2021/新冠疫苗接种声明书_jiang.jpg b/documents/travel/china_2021/新冠疫苗接种声明书_jiang.jpg new file mode 100644 index 0000000..0b0632a Binary files /dev/null and b/documents/travel/china_2021/新冠疫苗接种声明书_jiang.jpg differ diff --git a/documents/travel/china_2021/新冠疫苗接种声明书_jiang.md b/documents/travel/china_2021/新冠疫苗接种声明书_jiang.md new file mode 100644 index 0000000..9d4bde1 --- /dev/null +++ b/documents/travel/china_2021/新冠疫苗接种声明书_jiang.md @@ -0,0 +1,37 @@ +--- +type: vaccine-declaration +category: travel +person: Xuewei Jiang +date: 2021-12-16 +source: 新冠疫苗接种声明书_jiang.jpg +--- + +# 新冠疫苗接种声明书 / Letter of Commitment on COVID-19 Vaccination — Jiang Xuewei (Signed Copy) + +This is a scanned/photographed copy of the signed declaration form. 
The PDF version (疫苗接种声明书_jiang.pdf) contains the same content. This JPG shows the completed form with handwritten signature. + +## Declarant Information + +| Field | Value | +|-------|-------| +| Name (声明人姓名) | 姜薛伟 | +| Gender (性别) | 女 (Female) | +| Date of Birth (出生日期) | 1993年3月13日 | +| Passport No. (护照号) | EJ4954380 | +| Telephone (电话) | +12542149350 | +| Email (电邮) | xueweijiang0313@gmail.com | + +## Vaccination Details + +| Field | Value | +|-------|-------| +| Vaccine brand name (疫苗品牌名称) | 辉瑞 (Pfizer) | +| Vaccination institution (接种机构名称) | University of Southern California | +| Address (接种机构地址) | 3701 Flower St, Los Angeles, CA 90007 | +| Contact (接种机构联系方式) | 213-740-9355 | +| Doses (剂次) | Two doses (二剂次) checked | +| First dose date (第一剂接种日期) | 2021年4月6日 | +| Second dose date (第二剂接种日期) | 2021年4月27日 | + +**Signature:** (handwritten signature present) +**Date:** 2021年12月16日 diff --git a/documents/travel/china_2021/新冠疫苗接种声明书_lu.jpg b/documents/travel/china_2021/新冠疫苗接种声明书_lu.jpg new file mode 100644 index 0000000..add441a Binary files /dev/null and b/documents/travel/china_2021/新冠疫苗接种声明书_lu.jpg differ diff --git a/documents/travel/china_2021/新冠疫苗接种声明书_lu.md b/documents/travel/china_2021/新冠疫苗接种声明书_lu.md new file mode 100644 index 0000000..68cb7ed --- /dev/null +++ b/documents/travel/china_2021/新冠疫苗接种声明书_lu.md @@ -0,0 +1,37 @@ +--- +type: vaccine-declaration +category: travel +person: Yanxin Lu +date: 2021-12-16 +source: 新冠疫苗接种声明书_lu.jpg +--- + +# 新冠疫苗接种声明书 / Letter of Commitment on COVID-19 Vaccination — Lu Yanxin (Signed Copy) + +This is a scanned/photographed copy of the signed declaration form. The PDF version (疫苗接种声明书_lu.pdf) contains the same content. This JPG shows the completed form with handwritten signature. + +## Declarant Information + +| Field | Value | +|-------|-------| +| Name (声明人姓名) | 陆彦忻 | +| Gender (性别) | 男 (Male) | +| Date of Birth (出生日期) | 1989年10月17日 | +| Passport No. 
(护照号) | E93603635 | +| Telephone (电话) | +12542241457 | +| Email (电邮) | crac1017@gmail.com | + +## Vaccination Details + +| Field | Value | +|-------|-------| +| Vaccine brand name (疫苗品牌名称) | 辉瑞 (Pfizer) | +| Vaccination institution (接种机构名称) | Forum Stadium | +| Address (接种机构地址) | 3900 W Manchester Blvd, Inglewood, CA, 90305 | +| Contact (接种机构联系方式) | 833-540-0473 | +| Doses (剂次) | Two doses (二剂次) checked | +| First dose date (第一剂接种日期) | 2021年4月15日 | +| Second dose date (第二剂接种日期) | 2021年5月6日 | + +**Signature:** (handwritten signature present) +**Date:** 2021年12月16日 diff --git a/documents/travel/china_2021/疫苗接种声明书_jiang.md b/documents/travel/china_2021/疫苗接种声明书_jiang.md new file mode 100644 index 0000000..b85291c --- /dev/null +++ b/documents/travel/china_2021/疫苗接种声明书_jiang.md @@ -0,0 +1,48 @@ +--- +type: vaccine-declaration +category: travel +person: Xuewei Jiang +date: 2021-12-16 +source: 疫苗接种声明书_jiang.pdf +--- + +# 新冠疫苗接种声明书 / Letter of Commitment on COVID-19 Vaccination — Jiang Xuewei + +## Declarant Information + +| Field (中文) | Field (English) | Value | +|------------|-----------------|-------| +| 声明人姓名 | Name | 姜薛伟 | +| 性别 | Gender | 女 (Female) | +| 出生日期 | Date of Birth | 1993年3月13日 | +| 护照号 | Passport No. | EJ4954380 | +| 电话 | Telephone | +12542149350 | +| 电邮 | Email | xueweijiang0313@gmail.com | + +## Statement (声明内容) + +1. 
本人已接种新冠疫苗,接种详情如下 / I have received COVID-19 vaccination and the details are as follows: + +| # | Field (中文) | Field (English) | Value | +|---|-----------|-----------------|-------| +| 1 | 疫苗品牌名称 | Vaccine brand name | 辉瑞 (Pfizer) | +| 2 | 接种机构名称 | Name of vaccination institution | University of Southern California | +| 3 | 接种机构地址 | Address of vaccination institution | 3701 Flower St, Los Angeles, CA 90007 | +| 4 | 接种机构联系方式 | Contact information | 213-740-9355 | +| 5 | 疫苗接种剂次 | Doses | 二剂次 / Two doses (checked) | + +### Vaccination Dates + +| Dose | Date | +|------|------| +| 第一剂 / First dose | 2021年4月6日 | +| 第二剂 / Second dose | 2021年4月27日 | + +2. 本人所附疫苗接种凭证(接种卡或其它接种证明)真实无误。 +I hereby declare that the attached vaccination certificate (vaccination card or other forms of certification) is true and accurate. + +本人保证以上所有内容真实,并愿意承担由此引起的一切法律责任,包括但不限于因虚报、瞒报导致被限制去中国旅行或被追究法律责任等后果。 +I hereby declare that the information provided above is true, and I shall bear all legal responsibilities arising therefrom, including but not limited to restricted travel to China, punishment by law, or other consequences in the case of partial or false disclosures. 
+ +**声明人签名 / Signature:** (signed) +**日期 / Date:** 2021年12月16日 diff --git a/documents/travel/china_2021/疫苗接种声明书_jiang.pdf b/documents/travel/china_2021/疫苗接种声明书_jiang.pdf new file mode 100644 index 0000000..4ef6050 Binary files /dev/null and b/documents/travel/china_2021/疫苗接种声明书_jiang.pdf differ diff --git a/documents/travel/china_2021/疫苗接种声明书_lu.md b/documents/travel/china_2021/疫苗接种声明书_lu.md new file mode 100644 index 0000000..ca680b7 --- /dev/null +++ b/documents/travel/china_2021/疫苗接种声明书_lu.md @@ -0,0 +1,48 @@ +--- +type: vaccine-declaration +category: travel +person: Yanxin Lu +date: 2021-12-16 +source: 疫苗接种声明书_lu.pdf +--- + +# 新冠疫苗接种声明书 / Letter of Commitment on COVID-19 Vaccination — Lu Yanxin + +## Declarant Information + +| Field (中文) | Field (English) | Value | +|------------|-----------------|-------| +| 声明人姓名 | Name | 陆彦忻 | +| 性别 | Gender | 男 (Male) | +| 出生日期 | Date of Birth | 1989年10月17日 | +| 护照号 | Passport No. | E93603635 | +| 电话 | Telephone | +12542241457 | +| 电邮 | Email | crac1017@gmail.com | + +## Statement (声明内容) + +1. 本人已接种新冠疫苗,接种详情如下 / I have received COVID-19 vaccination and the details are as follows: + +| # | Field (中文) | Field (English) | Value | +|---|-----------|-----------------|-------| +| 1 | 疫苗品牌名称 | Vaccine brand name | 辉瑞 (Pfizer) | +| 2 | 接种机构名称 | Name of vaccination institution | Forum Stadium | +| 3 | 接种机构地址 | Address of vaccination institution | 3900 W Manchester Blvd, Inglewood, CA, 90305 | +| 4 | 接种机构联系方式 | Contact information | 833-540-0473 | +| 5 | 疫苗接种剂次 | Doses | 二剂次 / Two doses (checked) | + +### Vaccination Dates + +| Dose | Date | +|------|------| +| 第一剂 / First dose | 2021年4月15日 | +| 第二剂 / Second dose | 2021年5月6日 | + +2. 本人所附疫苗接种凭证(接种卡或其它接种证明)真实无误。 +I hereby declare that the attached vaccination certificate (vaccination card or other forms of certification) is true and accurate. 
+ +本人保证以上所有内容真实,并愿意承担由此引起的一切法律责任,包括但不限于因虚报、瞒报导致被限制去中国旅行或被追究法律责任等后果。 +I hereby declare that the information provided above is true, and I shall bear all legal responsibilities arising therefrom, including but not limited to restricted travel to China, punishment by law, or other consequences in the case of partial or false disclosures. + +**声明人签名 / Signature:** (signed) +**日期 / Date:** 2021年12月16日 diff --git a/documents/travel/china_2021/疫苗接种声明书_lu.pdf b/documents/travel/china_2021/疫苗接种声明书_lu.pdf new file mode 100644 index 0000000..151d777 Binary files /dev/null and b/documents/travel/china_2021/疫苗接种声明书_lu.pdf differ diff --git a/documents/travel/fengqin_xue_us_visit_2023/fengqin_cbp_invitation.md b/documents/travel/fengqin_xue_us_visit_2023/fengqin_cbp_invitation.md new file mode 100644 index 0000000..0d55c46 --- /dev/null +++ b/documents/travel/fengqin_xue_us_visit_2023/fengqin_cbp_invitation.md @@ -0,0 +1,23 @@ +--- +type: invitation-letter +category: travel +person: Fengqin Xue +date: 2023-11-01 +source: fengqin_cbp_invitation.pdf +--- + +# CBP Invitation Letter for Fengqin Xue — 2023 US Visit + +Dear Officer, + +My name is Xuewei Jiang. I am an H1B visa holder living in Los Angeles working at USC Marshall School of Business. I am writing to invite my mother, Fengqin Xue, to come to the US to visit me and my husband. My mother is a resident of Jinan, China and she is planning to visit from 11/15/2023 till 01/02/2024. + +During her visit, my mother will be staying with me at 12421 Sanford St, Los Angeles, CA. Occasionally, my husband and I will take her on a tour around LA. We will be responsible for her accommodation, transportation, and other travel expenses. + +Please do not hesitate to contact me if you have any questions or require further information. + +Thank you for your consideration. 
+ +Sincerely, + +Xuewei Jiang diff --git a/documents/travel/fengqin_xue_us_visit_2023/fengqin_cbp_invitation.pdf b/documents/travel/fengqin_xue_us_visit_2023/fengqin_cbp_invitation.pdf new file mode 100644 index 0000000..1879fe7 Binary files /dev/null and b/documents/travel/fengqin_xue_us_visit_2023/fengqin_cbp_invitation.pdf differ diff --git a/documents/travel/fengqin_xue_us_visit_2023/fengqin_xue_evus.md b/documents/travel/fengqin_xue_us_visit_2023/fengqin_xue_evus.md new file mode 100644 index 0000000..3fe74db --- /dev/null +++ b/documents/travel/fengqin_xue_us_visit_2023/fengqin_xue_evus.md @@ -0,0 +1,160 @@ +--- +type: evus-enrollment +category: travel +person: Fengqin Xue +date: 2023-11-01 +source: fengqin_xue_evus.pdf +--- + +# EVUS Enrollment — Fengqin Xue (2023) + +**Status: ENROLLED** + +An EVUS status of "Enrolled" means the EVUS requirement has been met for travel to the United States 1) for a validity period of two years from the date of enrollment, OR 2) until the Primary passport expires, OR 3) until the visa expires, whichever occurs first. + +## Enrollment Summary + +| Field | Value | +|-------|-------| +| Enrollment Number | OTER1U9X5N8A2S3L | +| Expires | February 4, 2025 | + +## Travel Document Information + +| Field | Value | +|-------|-------| +| Surname | XUE | +| Given Name | FENGQIN | +| Date of Birth | April 6, 1965 | + +## 10 Year U.S. Visa Information + +| Field | Value | +|-------|-------| +| 10 year U.S. Visa for business or pleasure? | Yes | +| B1/B2 Visa Foil Number | J3864872 | + +## Passport You Are Traveling With + +| Field | Value | +|-------|-------| +| Passport Number | EJ6254567 | +| Passport Country of Issuance | CHINA (CHN) | +| Passport Issue Date | December 12, 2022 | +| Passport Expiration Date | December 11, 2032 | +| Does the passport contain your U.S. Visa? | No | + +## Passport With U.S. 
Visa + +| Field | Value | +|-------|-------| +| Passport Number | G49056250 | +| Passport Country of Issuance | CHINA (CHN) | +| Passport Issue Date | March 4, 2011 | +| Passport Expiration Date | March 3, 2021 | +| Surname | XUE | +| Given Name | FENGQIN | +| Date of Birth | April 6, 1965 | + +## Enrollee / Passport Information + +| Field | Value | +|-------|-------| +| Surname in Native Language | 薛 | +| First (Given) Name in Native Language | 丰芹 | +| Gender | Female | +| City of Birth | HAIYANG | +| Country of Birth | CHINA (CHN) | +| Country of Citizenship | CHINA (CHN) | +| National Identification Number | 370105196504060823 | +| Home Address Line 1 | HUANGTAI SOUTH ROAD #79 | +| Apartment Number | 1-1-401 | +| City | JINAN | +| State/Province/Region | SHANDONG | +| Country | CHINA (CHN) | +| Home Address in Native Language | 山东省济南市天桥区黄台南路79号院1号楼1单元401室 | +| Email Address | XUEWEIJIANG0313@GMAIL.COM | +| Phone Type | Cell | +| Country Code | CHINA (CHN) (+86) | +| Phone Number | 13583102904 | + +## Aliases + +Are you known by any other names or aliases? No + +## Other Citizenship / Nationality + +- Are you now, a citizen or national of any other country? No +- Have you ever been a citizen or national of any other country? No +- Have you ever been issued a passport or national identity card for travel by any other country? No + +## GE Membership + +Are you a member of the CBP Global Entry Program? No + +## Parents + +| Relationship | Surname | First (Given) Name | +|-------------|---------|-------------------| +| Father | XUE | JINGYUN | +| Mother | YANG | GUIYING | + +## Employment Information + +| Field | Value | +|-------|-------| +| Current or previous employer? 
| Yes | +| Job Title | SENIOR ENGINEER | +| Employer Name | CHINA RAILWAY NO.10 ENGINEERING GROUP | +| Employer Name in Native Language | 中铁十局集团有限公司 | +| Address Line 1 | SHUNTAI SQUARE #7 | +| City | JINAN | +| State/Province/Region | SHANDONG | +| Country | CHINA (CHN) | + +## Travel Information + +| Field | Value | +|-------|-------| +| Travel to the U.S. occurring in transit to another country? | No | + +## U.S. Point of Contact Information + +| Field | Value | +|-------|-------| +| Name | XUEWEI JIANG | +| Address Line 1 | 12421 SANFORD ST | +| City | LOS ANGELES | +| State/Province/Region | CALIFORNIA | +| Country Code | UNITED STATES (USA) (+1) | +| Phone Number | 2542149350 | + +## Address While in the U.S. + +| Field | Value | +|-------|-------| +| Address Line 1 | 12421 SANFORD ST | +| City | LOS ANGELES | +| State/Province/Region | CALIFORNIA | + +## Emergency Contact Information + +| Field | Value | +|-------|-------| +| Surname | JIANG | +| First (Given) Name | XUEWEI | +| Email Address | XUEWEIJIANG0313@GMAIL.COM | +| Country Code | UNITED STATES (USA) (+1) | +| Phone Number | 2542149350 | + +## Eligibility Questions + +1. Physical or mental disorder / communicable diseases? **No** +2. Arrested or convicted for a crime? **No** +3. Violated any law related to illegal drugs? **No** +4. Engaged in terrorist activities, espionage, sabotage, or genocide? **No** +5. Committed fraud or misrepresented yourself? **No** +6. Seeking employment in the United States? **No** +7. Denied a U.S. visa or refused admission? **No** +8. Stayed longer than the admission period? **No** +9. Traveled to Iran, Iraq, Libya, North Korea, Somalia, Sudan, Syria or Yemen on or after March 1, 2011? 
**No** diff --git a/documents/travel/fengqin_xue_us_visit_2023/fengqin_xue_evus.pdf b/documents/travel/fengqin_xue_us_visit_2023/fengqin_xue_evus.pdf new file mode 100644 index 0000000..e0437d6 Binary files /dev/null and b/documents/travel/fengqin_xue_us_visit_2023/fengqin_xue_evus.pdf differ diff --git a/documents/travel/fengqin_xue_us_visit_2025/fengqin_cbp_invitation_2025.md b/documents/travel/fengqin_xue_us_visit_2025/fengqin_cbp_invitation_2025.md new file mode 100644 index 0000000..536ddcc --- /dev/null +++ b/documents/travel/fengqin_xue_us_visit_2025/fengqin_cbp_invitation_2025.md @@ -0,0 +1,23 @@ +--- +type: invitation-letter +category: travel +person: Fengqin Xue +date: 2025-02-10 +source: fengqin_cbp_invitation_2025.pdf +--- + +# CBP Invitation Letter for Fengqin Xue — 2025 US Visit + +Dear Officer, + +My name is Xuewei Jiang. I am a lawful permanent resident living in Los Angeles working at USC Marshall School of Business. I am writing to invite my mother, Fengqin Xue, to come to the US to visit me and my husband. My mother is a resident of Jinan, China and she is planning to visit from 02/16/2025 till 03/29/2025. + +During her visit, my mother will be staying with me at 12421 Sanford St, Los Angeles, CA. Occasionally, my husband and I will take her on a tour around LA. We will be responsible for her accommodation, transportation, and other travel expenses. + +Please do not hesitate to contact me if you have any questions or require further information. + +Thank you for your consideration. 
+ +Sincerely, + +Xuewei Jiang diff --git a/documents/travel/fengqin_xue_us_visit_2025/fengqin_cbp_invitation_2025.pdf b/documents/travel/fengqin_xue_us_visit_2025/fengqin_cbp_invitation_2025.pdf new file mode 100644 index 0000000..74c0e29 Binary files /dev/null and b/documents/travel/fengqin_xue_us_visit_2025/fengqin_cbp_invitation_2025.pdf differ diff --git a/documents/travel/fengqin_xue_us_visit_2025/xue_evus_2025.md b/documents/travel/fengqin_xue_us_visit_2025/xue_evus_2025.md new file mode 100644 index 0000000..e9781cb --- /dev/null +++ b/documents/travel/fengqin_xue_us_visit_2025/xue_evus_2025.md @@ -0,0 +1,148 @@ +--- +type: evus-enrollment +category: travel +person: Fengqin Xue +date: 2025-02-09 +source: xue_evus_2025.pdf +--- + +# EVUS Enrollment — Fengqin Xue (2025, February) + +**Status: ENROLLED** + +An EVUS status of "Enrolled" means the EVUS requirement has been met for travel to the United States 1) for a validity period of two years from the date of enrollment, OR 2) until the Primary passport expires, OR 3) until the visa expires, whichever occurs first. + +## Enrollment Summary + +| Field | Value | +|-------|-------| +| Enrollment Number | RHB17TZU6NBHEELJ | +| Expires | February 4, 2027 | + +## Travel Document Information + +| Field | Value | +|-------|-------| +| Surname | XUE | +| Given Name | FENGQIN | +| Date of Birth | April 6, 1965 | + +## 10 Year U.S. Visa Information + +| Field | Value | +|-------|-------| +| 10 year U.S. Visa for business or pleasure? | Yes | +| B1/B2 Visa Foil Number | V2557424 | + +## Passport You Are Traveling With + +| Field | Value | +|-------|-------| +| Passport Number | EJ6254567 | +| Passport Country of Issuance | CHINA (CHN) | +| Passport Issue Date | December 12, 2022 | +| Passport Expiration Date | December 11, 2032 | +| Does the passport contain your U.S. Visa? 
| Yes | + +## Enrollee / Passport Information + +| Field | Value | +|-------|-------| +| Surname in Native Language | 薛 | +| First (Given) Name in Native Language | 丰芹 | +| Sex | Female | +| City of Birth | JINAN | +| Country of Birth | CHINA (CHN) | +| Country of Citizenship | CHINA (CHN) | +| National Identification Number | 370105196504060823 | +| Home Address Line 1 | HUANGTAI SOUTH ROAD #79 | +| Home Address Line 2 | 1-1-401 | +| City | JINAN | +| State/Province/Region | SHANDONG | +| Country | CHINA (CHN) | +| Home Address in Native Language | 山东省济南市黄台南路79号院1号楼1单元401室 | +| Email Address | XUEWEIJIANG0313@GMAIL.COM | +| Phone Type | Cell | +| Country Code | CHINA (CHN) (+86) | +| Phone Number | 13583102904 | + +## Aliases + +Are you known by any other names or aliases? No + +## Other Citizenship / Nationality + +- Are you now, a citizen or national of any other country? No +- Have you ever been a citizen or national of any other country? No +- Have you ever been issued a passport or national identity card for travel by any other country? No + +## GE Membership + +Are you a member of the CBP Global Entry Program? No + +## Parents + +| Relationship | Surname | First (Given) Name | +|-------------|---------|-------------------| +| Father | XUE | JINGYUN | +| Mother | YANG | GUIYING | + +## Employment Information + +| Field | Value | +|-------|-------| +| Current or previous employer? | Yes | +| Employer Name | CHINA RAILWAY NO. 10 ENGINEERING GROUP | +| Employer Name in Native Language | 中铁十局 | +| Address Line 1 | SHUNHUA RD #2000 | +| Address Line 2 | BUILDING #7 | +| City | JINAN | +| State/Province/Region | SHANDONG | +| Country | CHINA (CHN) | + +## Travel Information + +| Field | Value | +|-------|-------| +| Travel to the U.S. occurring in transit to another country? | No | + +## U.S. 
Point of Contact Information + +| Field | Value | +|-------|-------| +| Name | XUEWEI JIANG | +| Address Line 1 | 12421 SANFORD ST | +| City | LOS ANGELES | +| State/Province/Region | CALIFORNIA | +| Country Code | UNITED STATES (USA) (+1) | +| Phone Number | 2542149350 | + +## Address While in the U.S. + +| Field | Value | +|-------|-------| +| Address Line 1 | 12421 SANFORD ST | +| City | LOS ANGELES | +| State/Province/Region | CALIFORNIA | + +## Emergency Contact Information + +| Field | Value | +|-------|-------| +| Surname | JIANG | +| First (Given) Name | XUEWEI | +| Email Address | XUEWEIJIANG0313@GMAIL.COM | +| Country Code | UNITED STATES (USA) (+1) | +| Phone Number | 2542149350 | + +## Eligibility Questions + +1. Physical or mental disorder / communicable diseases? **No** +2. Arrested or convicted for a crime? **No** +3. Violated any law related to illegal drugs? **No** +4. Engaged in terrorist activities, espionage, sabotage, or genocide? **No** +5. Committed fraud or misrepresented yourself? **No** +6. Seeking employment in the United States? **No** +7. Denied a U.S. visa or refused admission? **No** +8. Stayed longer than the admission period? **No** +9. Traveled to Iran, Iraq, Libya, North Korea, Somalia, Sudan, Syria or Yemen on or after March 1, 2011? 
**No** diff --git a/documents/travel/fengqin_xue_us_visit_2025/xue_evus_2025.pdf b/documents/travel/fengqin_xue_us_visit_2025/xue_evus_2025.pdf new file mode 100644 index 0000000..8a67b1a Binary files /dev/null and b/documents/travel/fengqin_xue_us_visit_2025/xue_evus_2025.pdf differ diff --git a/documents/travel/fengqin_xue_us_visit_2025/xue_evus_2025_09.md b/documents/travel/fengqin_xue_us_visit_2025/xue_evus_2025_09.md new file mode 100644 index 0000000..46bb332 --- /dev/null +++ b/documents/travel/fengqin_xue_us_visit_2025/xue_evus_2025_09.md @@ -0,0 +1,28 @@ +--- +type: evus-enrollment +category: travel +person: Fengqin Xue +date: 2025-09-07 +source: xue_evus_2025_09.pdf +--- + +# EVUS Enrollment — Fengqin Xue (2025, September) + +**Status: PENDING** + +Your EVUS enrollment is under review because an immediate determination could not be made. This response does not indicate negative findings. A determination will be available within 72 hours. + +## Enrollment Summary + +| Field | Value | +|-------|-------| +| Name | FENGQIN XUE | +| Date of Birth | Apr 6, 1965 | +| Enrollment Number | OMXRABLQQONAK78I | +| Passport Number | EJ6254567 | +| B1/B2 Visa Foil Number | V2557424 | +| Status | Pending | +| Expires | N/A | + +Printed from: Official EVUS Enrollment Website, U.S. 
Customs and Border Protection (https://www.evus.gov/print/evusStatus) +Date: 9/7/25, 11:02 AM diff --git a/documents/travel/fengqin_xue_us_visit_2025/xue_evus_2025_09.pdf b/documents/travel/fengqin_xue_us_visit_2025/xue_evus_2025_09.pdf new file mode 100644 index 0000000..034dc6a Binary files /dev/null and b/documents/travel/fengqin_xue_us_visit_2025/xue_evus_2025_09.pdf differ diff --git a/documents/travel/fengqin_xue_us_visit_2025/xue_passport_2022.jpg b/documents/travel/fengqin_xue_us_visit_2025/xue_passport_2022.jpg new file mode 100644 index 0000000..362950d Binary files /dev/null and b/documents/travel/fengqin_xue_us_visit_2025/xue_passport_2022.jpg differ diff --git a/documents/travel/fengqin_xue_us_visit_2025/xue_passport_2022.md b/documents/travel/fengqin_xue_us_visit_2025/xue_passport_2022.md new file mode 100644 index 0000000..b8b66f5 --- /dev/null +++ b/documents/travel/fengqin_xue_us_visit_2025/xue_passport_2022.md @@ -0,0 +1,39 @@ +--- +type: passport +category: travel +person: Fengqin Xue +date: 2022-12-12 +source: xue_passport_2022.jpg +--- + +# Chinese Passport — Xue Fengqin (薛丰芹) + +Photo of passport bio page from People's Republic of China. + +## Passport Details + +| Field | Value | +|-------|-------| +| Type | P (Passport) | +| Country Code | CHN | +| Passport No. | EJ6254567 | +| Surname (姓) | XUE | +| Given Names (名) | FENGQIN | +| Chinese Name | 薛丰芹 | +| Sex | F | +| Nationality | CHINESE | +| Date of Birth | 06 APR 1965 | +| Place of Birth | 山东 / SHANDONG | +| Date of Issue | 12/DEC/2022 | +| Date of Expiry | 11/DEC/2032 | +| Place of Issue | 山东 / SHANDONG | +| Authority | 中华人民共和国国家移民管理局 / National Immigration Administration PRC | + +## Machine Readable Zone (MRZ) + +``` +P